### Summary ####
# Input: cnt20, cnt21, and fmli203 - fmli213
# Output: df_all_cu (all households, for Section III analysis of the paper) 
#         and df_f (final sample, for the main analyses)
#         and cnt_cleaned, for summary statistics
# Outline: 
# 1. Merge EIP information into the fmli files 
# 2. Create the basic panel, constructing variables used in the regression analysis
# 3. Update the panel by computing first differences 
# 4. Compute weights, income, and liquidity
# 5. Compute average expenditure at the CU level
# 6. Incorporate variables created in 4 and 5 into the panel, and re-arrange 
# 7. Create "All household sample" and "final sample" 

### 0 Preparations ####
# All relative paths used below ("Raw data/...", "cnt_cleaned.csv") resolve
# against the current working directory, so run this script from the project
# root. No setwd() call is needed: setwd(getwd()) does not change the directory.

# Open libraries 
library(readxl) # for importing raw data
library(dplyr) # for data processing
library(quantreg) # for quantile regressions used in data cleaning

# Read the two contribution (cnt) workbooks
cnt20 <- readxl::read_excel(path = "Raw data/cnt20.xlsx")
cnt21 <- readxl::read_excel(path = "Raw data/cnt21.xlsx")

# Read each family (fmli) workbook and immediately keep an untouched copy,
# because the fmli objects themselves are modified further down
fmli203 <- readxl::read_excel(path = "Raw data/fmli203.xlsx")
fmli203_copy <- fmli203

fmli204 <- readxl::read_excel(path = "Raw data/fmli204.xlsx")
fmli204_copy <- fmli204

fmli211 <- readxl::read_excel(path = "Raw data/fmli211.xlsx")
fmli211_copy <- fmli211

fmli212 <- readxl::read_excel(path = "Raw data/fmli212.xlsx")
fmli212_copy <- fmli212

fmli213 <- readxl::read_excel(path = "Raw data/fmli213.xlsx")
fmli213_copy <- fmli213

# Recode QINTRVMO (interview month) into a numeric YYMM code and drop QINTRVMO.
# For each quarterly file the first two months are matched explicitly; every
# other non-missing value receives the code of the quarter's last month, and a
# missing QINTRVMO stays missing.

# 2020 Q3: July - September 2020
fmli203 <- fmli203 %>%
  mutate(
    YYMM = case_when(
      is.na(QINTRVMO) ~ NA_real_,
      QINTRVMO == 7 ~ 2007,
      QINTRVMO == 8 ~ 2008,
      TRUE ~ 2009
    )
  ) %>%
  select(-QINTRVMO)

# 2020 Q4: October - December 2020
fmli204 <- fmli204 %>%
  mutate(
    YYMM = case_when(
      is.na(QINTRVMO) ~ NA_real_,
      QINTRVMO == 10 ~ 2010,
      QINTRVMO == 11 ~ 2011,
      TRUE ~ 2012
    )
  ) %>%
  select(-QINTRVMO)

# 2021 Q1: January - March 2021
fmli211 <- fmli211 %>%
  mutate(
    YYMM = case_when(
      is.na(QINTRVMO) ~ NA_real_,
      QINTRVMO == 1 ~ 2101,
      QINTRVMO == 2 ~ 2102,
      TRUE ~ 2103
    )
  ) %>%
  select(-QINTRVMO)

# 2021 Q2: April - June 2021
fmli212 <- fmli212 %>%
  mutate(
    YYMM = case_when(
      is.na(QINTRVMO) ~ NA_real_,
      QINTRVMO == 4 ~ 2104,
      QINTRVMO == 5 ~ 2105,
      TRUE ~ 2106
    )
  ) %>%
  select(-QINTRVMO)

# 2021 Q3: July - September 2021
fmli213 <- fmli213 %>%
  mutate(
    YYMM = case_when(
      is.na(QINTRVMO) ~ NA_real_,
      QINTRVMO == 7 ~ 2107,
      QINTRVMO == 8 ~ 2108,
      TRUE ~ 2109
    )
  ) %>%
  select(-QINTRVMO)

# Keep only CONTCODE 800 rows of cnt20 and recode CONTMO (receipt month) into
# the numeric RYYMM code: months 4-12 map to 2004-2012 and month 1 to 2101.
# Every other non-missing month falls through to 2102; a missing CONTMO stays
# missing. CONTMO is dropped afterwards.
cnt20 <- cnt20 %>%
  filter(CONTCODE == 800) %>%
  mutate(
    RYYMM = case_when(
      is.na(CONTMO) ~ NA_real_,
      CONTMO == 4 ~ 2004,
      CONTMO == 5 ~ 2005,
      CONTMO == 6 ~ 2006,
      CONTMO == 7 ~ 2007,
      CONTMO == 8 ~ 2008,
      CONTMO == 9 ~ 2009,
      CONTMO == 10 ~ 2010,
      CONTMO == 11 ~ 2011,
      CONTMO == 12 ~ 2012,
      CONTMO == 1 ~ 2101,
      TRUE ~ 2102
    )
  ) %>%
  select(-CONTMO)

# Drop the 5 relevant rebates received as tax refund (CHCKEFT == 4) by 3 CUs,
# because
#   1) it is unclear which round of EIPs they belong to,
#   2) all but one are small in amount (less than or equal to 600), and
#   3) they are received late (June to August 2021).
# Note: two other rebates received as tax refund are reported by two CUs in
# cnt21, but they would not be in the sample anyway because of the sample
# restrictions.
# Rows with a missing CHCKEFT are kept.
#
# CONTMO (receipt month) is then recoded into RYYMM: months 1-7 map to
# 2101-2107, every other non-missing month falls through to 2108, and a
# missing CONTMO stays missing. CONTMO is dropped afterwards.
cnt21 <- cnt21 %>%
  filter(
    CONTCODE == 800,
    is.na(CHCKEFT) | CHCKEFT != 4
  ) %>%
  mutate(
    RYYMM = case_when(
      is.na(CONTMO) ~ NA_real_,
      CONTMO == 1 ~ 2101,
      CONTMO == 2 ~ 2102,
      CONTMO == 3 ~ 2103,
      CONTMO == 4 ~ 2104,
      CONTMO == 5 ~ 2105,
      CONTMO == 6 ~ 2106,
      CONTMO == 7 ~ 2107,
      TRUE ~ 2108
    )
  ) %>%
  select(-CONTMO)

# Stack the two years of rebates and keep a snapshot of the combined table
cnt <- bind_rows(cnt20, cnt21)
cnt_copy <- cnt

### 1 Clean cnt and combine with fmli's ####

#### 1.0.1 Modify EIP II reported in November ####
# Rebates with a November 2020 receipt month, tagged with the CU ID
# (first six characters of NEWID)
cnt_nov <- cnt %>%
  filter(RYYMM == 2011) %>%
  mutate(ID = substr(as.character(NEWID), 1, 6))

# Interview month and family size for the interviews in fmli204 and fmli211
fmli204n211 <- bind_rows(fmli204, fmli211) %>%
  select(NEWID, YYMM, FAM_SIZE)

# Attach interview info, total the November rebates reported in the same
# interview by the same CU, and keep one row per interview.
#
# One may contend that summing complicates the cases where a CU reports both
# EIP I and EIP II in November, BUT these cases are quite rare. Can check by:
# cnt_nov_1 <- cnt_nov %>% group_by(NEWID) %>% filter( n() == 1)
# cnt_nov_2 <- cnt_nov %>% group_by(NEWID) %>% filter( n() == 2)
cnt_nov <- merge(cnt_nov, fmli204n211, by = "NEWID") %>%
  group_by(NEWID) %>%
  mutate(EIP = sum(CONTEXPX)) %>%
  distinct(NEWID, .keep_all = TRUE)

# A November rebate is treated as a December rebate when the interview total
# is no greater than family size times 600.
# Of course, one might be concerned about small amounts due to phase-out, but
# there is only one November rebate that is less than 600, and we maintain
# that CUs subject to phase-out (i.e. high-income CUs) are unlikely to
# receive EIP I this late.
cnt_nov_dec <- cnt_nov %>% filter(EIP <= FAM_SIZE * 600)

# A November rebate stays in November when the total exceeds that threshold
cnt_nov_nov <- cnt_nov %>% filter(EIP > FAM_SIZE * 600)

# Interviews held in December whose November rebates were moved to December:
# the rebate then belongs to the reference period of the NEXT interview, so
# its NEWID is increased by 1 below.
cnt_nov_dec_add <- cnt_nov_dec %>%
  filter(YYMM == 2012)

# Apply both adjustments to the rebate table. Rebates of interviews listed in
# cnt_nov_nov keep RYYMM 2011, so only the move to December needs an explicit
# rule. NEWID is evaluated after RYYMM, so it sees the updated receipt month.
cnt <- cnt %>%
  mutate(
    RYYMM = ifelse(NEWID %in% cnt_nov_dec$NEWID & RYYMM == 2011, 2012, RYYMM),
    NEWID = ifelse(
      NEWID %in% cnt_nov_dec_add$NEWID & RYYMM == 2012,
      as.character(as.numeric(NEWID) + 1),
      NEWID
    )
  )

#### 1.0.2 Modify EIP II reported in February  ####

# Rebates with a February 2021 receipt month, tagged with the CU ID
# (first six characters of NEWID)
cnt_feb <- cnt %>%
  filter(RYYMM == 2102) %>%
  mutate(ID = substr(as.character(NEWID), 1, 6))

# Interview month and family size for the interviews in fmli211 and fmli212
fmli211n2 <- bind_rows(fmli211, fmli212) %>%
  select(NEWID, YYMM, FAM_SIZE)

cnt_feb <- merge(cnt_feb, fmli211n2, by = "NEWID")

# The February rebates of one interview are summed below.
# One may contend that this complicates the cases where a CU reports both
# EIP II and EIP III in February (might be true for cases like 4410314 and
# 4425874), BUT these cases are quite rare. The tables below split the
# interviews by the number of February rebates they report, for inspection:
cnt_feb_by_interview <- cnt_feb %>% group_by(NEWID)
cnt_feb_1 <- cnt_feb_by_interview %>% filter(n() == 1)
cnt_feb_2 <- cnt_feb_by_interview %>% filter(n() == 2)
cnt_feb_3 <- cnt_feb_by_interview %>% filter(n() == 3)
cnt_feb_4 <- cnt_feb_by_interview %>% filter(n() == 4)
cnt_feb_5 <- cnt_feb_by_interview %>% filter(n() >= 5)

# Interview-level total, one row per interview
cnt_feb <- cnt_feb %>%
  group_by(NEWID) %>%
  mutate(EIP = sum(CONTEXPX)) %>%
  distinct(NEWID, .keep_all = TRUE)

# A February rebate is moved to March if the interview total is greater than
# family size times 600
cnt_feb_mar_1 <- cnt_feb %>% filter(EIP > FAM_SIZE * 600)

# Remaining February rebates (total less than or equal to family size times 600)
cnt_feb_rest <- cnt_feb %>% filter(EIP <= FAM_SIZE * 600)

# Among the remaining ones, also move to March those whose total is a multiple
# of 1400 (1x to 10x).
# Note: the highest EIP amount in cnt_feb_rest is 5400
cnt_feb_mar_2 <- cnt_feb_rest %>%
  filter(EIP %in% (1400 * 1:10))

# Those are kept in Feb
cnt_feb_feb <- cnt_feb_rest %>% filter(!(ID %in% cnt_feb_mar_2$ID))
# Those are moved to March
cnt_feb_mar <- bind_rows(cnt_feb_mar_1, cnt_feb_mar_2)

# Interviews held in March whose February rebates were moved to March:
# the rebate then belongs to the reference period of the NEXT interview, so
# its NEWID is increased by 1 below.
cnt_feb_mar_add <- cnt_feb_mar %>%
  filter(YYMM == 2103)

# Adjust the receipt month and NEWID accordingly. Rebates of interviews listed
# in cnt_feb_feb keep RYYMM 2102, so only the move to March needs an explicit
# rule. NEWID is evaluated after RYYMM, so it sees the updated receipt month.
cnt <- cnt %>%
  mutate(
    RYYMM = ifelse(NEWID %in% cnt_feb_mar$NEWID & RYYMM == 2102, 2103, RYYMM),
    NEWID = ifelse(
      NEWID %in% cnt_feb_mar_add$NEWID & RYYMM == 2103,
      as.character(as.numeric(NEWID) + 1),
      NEWID
    )
  )

#### 1.1 Find all rebates ####
# EIP is the reported amount (CONTEXPX); rows with EIP equal to 0 are not
# treated as received rebates
cnt[["EIP"]] <- cnt[["CONTEXPX"]]

# All rebates actually received, reduced to the columns used from here on
cnt_rc <- select(
  filter(cnt, EIP != 0),
  NEWID, RYYMM, CHCKEFT, REBTUSED, EIP
)

#### 1.2 Modify missing values for disbursement method and usage ####
# Keep missing disbursement method as missing
# (Do nothing here)

# Adjust so that missing usage is the same as the reported usage for the other 
# EIP(s) received in the same reference period, if the CU has more than one EIP

# Split the rebates by how many rebates the same interview (NEWID) reports
cnt_rc_1 <- cnt_rc %>% group_by(NEWID) %>% filter(n() == 1)
cnt_rc_2 <- cnt_rc %>% group_by(NEWID) %>% filter(n() == 2)
cnt_rc_3 <- cnt_rc %>% group_by(NEWID) %>% filter(n() == 3)
cnt_rc_4 <- cnt_rc %>% group_by(NEWID) %>% filter(n() == 4)
cnt_rc_5 <- cnt_rc %>% group_by(NEWID) %>% filter(n() >= 5)

# For interviews with four rebates
# Only 4614702 needs adjustment, both missing REBTUSED should be 1 given the other is 1
# We won't adjust CU 4503184 since the other EIPs have different uses
cnt_rc_4 <- cnt_rc_4 %>%
  mutate(REBTUSED = ifelse(NEWID == 4614702, 1, REBTUSED))

# For interviews with three rebates
# Apply the similar logic
cnt_rc_3 <- cnt_rc_3 %>%
  mutate(
    REBTUSED = ifelse(
      NEWID %in% c(4560502, 4514244, 4560503, 4655872), 1,
      ifelse(NEWID == 4652902, 3, REBTUSED)
    )
  )

# For interviews with exactly two rebates: when one usage is missing and the
# other is reported, copy the reported usage onto the missing one.
# This is done per NEWID group rather than by looping over neighbouring rows,
# so it does not depend on the two rows of an interview being adjacent and it
# is safe when cnt_rc_2 has zero or one row. replace() keeps the column type.
cnt_rc_2 <- cnt_rc_2 %>%
  group_by(NEWID) %>%
  mutate(
    REBTUSED = replace(
      REBTUSED,
      is.na(REBTUSED) & sum(!is.na(REBTUSED)) == 1,
      REBTUSED[!is.na(REBTUSED)][1]
    )
  )
# In fact, nothing needs to be changed here

# Merge back
cnt_rc <- bind_rows(cnt_rc_1, cnt_rc_2, cnt_rc_3, cnt_rc_4, cnt_rc_5)

cnt_rc <- cnt_rc %>% arrange(NEWID)

# Cleaned rebate-level table, used for summary statistics
write.csv(cnt_rc,"cnt_cleaned.csv")

#### 1.3 Create rebate variables at the interview level ####

# Rebate-level EIP variables by round, disbursement method, use and month.
#
# keep_or_zero() returns `amount` where `cond` is TRUE, 0 where it is FALSE
# and NA where `cond` is NA.
#
# The "| <round> == 0" part of the method/use conditions makes a rebate that
# does not belong to the round contribute 0 even when CHCKEFT / REBTUSED is
# missing; a rebate that does belong to the round but has a missing
# CHCKEFT / REBTUSED yields NA.
#
# NOTE: the number and order of the columns created here matter, because the
# rebate-level columns are later dropped by position.
keep_or_zero <- function(cond, amount) {
  ifelse(cond, amount, 0)
}

cnt_rc <- cnt_rc %>%
  mutate(
    # EIP I (Apr-Nov 2020), EIP II (Dec 2020-Feb 2021), EIP III (Mar-Aug 2021)
    EIPI = keep_or_zero(RYYMM >= 2004 & RYYMM <= 2011, EIP),
    EIPII = keep_or_zero(RYYMM == 2012 | (RYYMM >= 2101 & RYYMM <= 2102), EIP),
    EIPIII = keep_or_zero(RYYMM >= 2103 & RYYMM <= 2108, EIP),

    # EIP I by disbursement method and use
    EIPI_ck = keep_or_zero(CHCKEFT == 1 | EIPI == 0, EIPI),    # by check
    EIPI_dd = keep_or_zero(CHCKEFT == 2 | EIPI == 0, EIPI),    # by direct deposit
    EIPI_dc = keep_or_zero(CHCKEFT == 3 | EIPI == 0, EIPI),    # by debit card
    EIPI_ep = keep_or_zero(REBTUSED == 1 | EIPI == 0, EIPI),   # for expenses
    EIPI_debt = keep_or_zero(REBTUSED == 2 | EIPI == 0, EIPI), # for debt
    EIPI_sv = keep_or_zero(REBTUSED == 3 | EIPI == 0, EIPI),   # for savings

    # EIP II by disbursement method and use
    EIPII_ck = keep_or_zero(CHCKEFT == 1 | EIPII == 0, EIPII),
    EIPII_dd = keep_or_zero(CHCKEFT == 2 | EIPII == 0, EIPII),
    EIPII_dc = keep_or_zero(CHCKEFT == 3 | EIPII == 0, EIPII),
    EIPII_ep = keep_or_zero(REBTUSED == 1 | EIPII == 0, EIPII),
    EIPII_debt = keep_or_zero(REBTUSED == 2 | EIPII == 0, EIPII),
    EIPII_sv = keep_or_zero(REBTUSED == 3 | EIPII == 0, EIPII),

    # EIP III by disbursement method and use
    EIPIII_ck = keep_or_zero(CHCKEFT == 1 | EIPIII == 0, EIPIII),
    EIPIII_dd = keep_or_zero(CHCKEFT == 2 | EIPIII == 0, EIPIII),
    EIPIII_dc = keep_or_zero(CHCKEFT == 3 | EIPIII == 0, EIPIII),
    EIPIII_ep = keep_or_zero(REBTUSED == 1 | EIPIII == 0, EIPIII),
    EIPIII_debt = keep_or_zero(REBTUSED == 2 | EIPIII == 0, EIPIII),
    EIPIII_sv = keep_or_zero(REBTUSED == 3 | EIPIII == 0, EIPIII),

    # EIP I by receipt month
    EIPI_Apr = keep_or_zero(RYYMM == 2004, EIPI),
    EIPI_May = keep_or_zero(RYYMM == 2005, EIPI),
    EIPI_Jun = keep_or_zero(RYYMM == 2006, EIPI),
    EIPI_Jul = keep_or_zero(RYYMM == 2007, EIPI),
    EIPI_Aug = keep_or_zero(RYYMM == 2008, EIPI),
    EIPI_Sep = keep_or_zero(RYYMM == 2009, EIPI),
    EIPI_Oct = keep_or_zero(RYYMM == 2010, EIPI),
    EIPI_Nov = keep_or_zero(RYYMM == 2011, EIPI),

    # EIP II by receipt month
    EIPII_Dec = keep_or_zero(RYYMM == 2012, EIPII),
    EIPII_Jan = keep_or_zero(RYYMM == 2101, EIPII),
    EIPII_Feb = keep_or_zero(RYYMM == 2102, EIPII),

    # EIP III by receipt month
    EIPIII_Mar = keep_or_zero(RYYMM == 2103, EIPIII),
    EIPIII_Apr = keep_or_zero(RYYMM == 2104, EIPIII),
    EIPIII_May = keep_or_zero(RYYMM == 2105, EIPIII),
    EIPIII_Jun = keep_or_zero(RYYMM == 2106, EIPIII),
    EIPIII_Jul = keep_or_zero(RYYMM == 2107, EIPIII),
    EIPIII_Aug = keep_or_zero(RYYMM == 2108, EIPIII)
  )

# Just to see how many interviews report more than one rebate
cnt_rc_1 <- cnt_rc %>% group_by(NEWID) %>% filter(n() == 1)
cnt_rc_2 <- cnt_rc %>% group_by(NEWID) %>% filter(n() == 2)
cnt_rc_3 <- cnt_rc %>% group_by(NEWID) %>% filter(n() == 3)
cnt_rc_4 <- cnt_rc %>% group_by(NEWID) %>% filter(n() == 4)
cnt_rc_5 <- cnt_rc %>% group_by(NEWID) %>% filter(n() >= 5)

# Sum all EIPs received by a household in the same reference period, i.e. per
# NEWID. mutate() (not summarise()) is used, so every rebate row of an
# interview carries the interview totals; duplicates are dropped afterwards.
# The sums are taken without na.rm, so a total is NA as soon as one of its
# rebate-level components is NA (e.g. missing disbursement method or use).
cnt_rc <- cnt_rc %>%
  group_by(NEWID) %>%
  mutate(
    # EIP I totals: overall, by disbursement method, by use
    EIPI_t = sum(EIPI),
    EIPI_by_ck_t = sum(EIPI_ck),
    EIPI_by_dd_t = sum(EIPI_dd),
    EIPI_by_dc_t = sum(EIPI_dc),
    EIPI_for_ep_t = sum(EIPI_ep),
    EIPI_for_debt_t = sum(EIPI_debt),
    EIPI_for_sv_t = sum(EIPI_sv),

    # EIP II totals
    EIPII_t = sum(EIPII),
    EIPII_by_ck_t = sum(EIPII_ck),
    EIPII_by_dd_t = sum(EIPII_dd),
    EIPII_by_dc_t = sum(EIPII_dc),
    EIPII_for_ep_t = sum(EIPII_ep),
    EIPII_for_debt_t = sum(EIPII_debt),
    EIPII_for_sv_t = sum(EIPII_sv),

    # EIP III totals
    EIPIII_t = sum(EIPIII),
    EIPIII_by_ck_t = sum(EIPIII_ck),
    EIPIII_by_dd_t = sum(EIPIII_dd),
    EIPIII_by_dc_t = sum(EIPIII_dc),
    EIPIII_for_ep_t = sum(EIPIII_ep),
    EIPIII_for_debt_t = sum(EIPIII_debt),
    EIPIII_for_sv_t = sum(EIPIII_sv),
    # Number of EIP III payments in the interview. Only rows with a non-zero
    # EIP III amount are counted; n() would also count EIP I / EIP II rows
    # reported in the same interview.
    EIPIII_t_count = sum(EIPIII != 0),

    # Totals by receipt month
    EIPI_apr_t = sum(EIPI_Apr),
    EIPI_may_t = sum(EIPI_May),
    EIPI_jun_t = sum(EIPI_Jun),
    EIPI_jul_t = sum(EIPI_Jul),
    EIPI_aug_t = sum(EIPI_Aug),
    EIPI_sep_t = sum(EIPI_Sep),
    EIPI_oct_t = sum(EIPI_Oct),
    EIPI_nov_t = sum(EIPI_Nov),
    EIPII_dec_t = sum(EIPII_Dec),
    EIPII_jan_t = sum(EIPII_Jan),
    EIPII_feb_t = sum(EIPII_Feb),
    EIPIII_mar_t = sum(EIPIII_Mar),
    EIPIII_apr_t = sum(EIPIII_Apr),
    EIPIII_may_t = sum(EIPIII_May),
    EIPIII_jun_t = sum(EIPIII_Jun),
    EIPIII_jul_t = sum(EIPIII_Jul),
    EIPIII_aug_t = sum(EIPIII_Aug)
  ) %>%
  ungroup()

# Create indicator variables
# cnt_rc <- cnt_rc %>% 
#   mutate(iEIPI_t = ifelse(EIPI_t>0,1,0),
#          iEIPI_by_ck_t = ifelse(EIPI_by_ck_t>0,1,0),
#          iEIPI_by_dd_t = ifelse(EIPI_by_dd_t>0,1,0),
#          iEIPI_by_dc_t = ifelse(EIPI_by_dc_t>0,1,0),
#          iEIPI_for_ep_t = ifelse(EIPI_for_ep_t>0,1,0),
#          iEIPI_for_debt_t = ifelse(EIPI_for_debt_t>0,1,0),
#          iEIPI_for_sv_t = ifelse(EIPI_for_sv_t>0,1,0), 
#          
#          iEIPII_t = ifelse(EIPII_t>0,1,0),
#          iEIPII_by_ck_t = ifelse(EIPII_by_ck_t>0,1,0),
#          iEIPII_by_dd_t = ifelse(EIPII_by_dd_t>0,1,0),
#          iEIPII_by_dc_t = ifelse(EIPII_by_dc_t>0,1,0),
#          iEIPII_for_ep_t = ifelse(EIPII_for_ep_t>0,1,0),
#          iEIPII_for_debt_t = ifelse(EIPII_for_debt_t>0,1,0),
#          iEIPII_for_sv_t = ifelse(EIPII_for_sv_t>0,1,0), 
#          
#          iEIPIII_t = ifelse(EIPIII_t>0,1,0),
#          iEIPIII_by_ck_t = ifelse(EIPIII_by_ck_t>0,1,0),
#          iEIPIII_by_dd_t = ifelse(EIPIII_by_dd_t>0,1,0),
#          iEIPIII_by_dc_t = ifelse(EIPIII_by_dc_t>0,1,0),
#          iEIPIII_for_ep_t = ifelse(EIPIII_for_ep_t>0,1,0),
#          iEIPIII_for_debt_t = ifelse(EIPIII_for_debt_t>0,1,0),
#          iEIPIII_for_sv_t = ifelse(EIPIII_for_sv_t>0,1,0))

# Drop the rebate-level columns (RYYMM through EIPIII_Aug, i.e. columns 2-43)
# and the repeated rows left behind by group_by() + mutate().
# cnt_rc now holds one row per interview with all EIP information aggregated
# to the interview level.
cnt_rc <- cnt_rc %>%
  select(-(RYYMM:EIPIII_Aug)) %>%
  distinct(NEWID, .keep_all = TRUE)

#### 1.4 Identify different types of CUs  ####

# CU-level EIP III totals.
# The CU ID is NEWID without its last character (the interview number). All
# EIP III amounts of a CU are summed across its interviews and one row per CU
# is kept. Sums are taken without na.rm, so a CU total is NA when any of its
# interview-level components is NA; such CUs fail the "> 0" filters below.
lists_base <- cnt_rc %>%
  mutate(ID = substr(NEWID, 1, nchar(NEWID) - 1)) %>%
  group_by(ID) %>%
  mutate(
    TEIPIII = sum(EIPIII_t),
    TEIPIII_by_ck = sum(EIPIII_by_ck_t),
    TEIPIII_by_dd = sum(EIPIII_by_dd_t),
    TEIPIII_by_dc = sum(EIPIII_by_dc_t),
    TEIPIII_for_ep = sum(EIPIII_for_ep_t),
    TEIPIII_for_debt = sum(EIPIII_for_debt_t),
    TEIPIII_for_sv = sum(EIPIII_for_sv_t)
  ) %>%
  ungroup() %>%
  distinct(ID, .keep_all = TRUE)

# One-column (ID) tables of CUs with a positive EIP III total of each kind
list_r <- lists_base %>% filter(TEIPIII > 0) %>% select(ID)             # recipients
list_ck <- lists_base %>% filter(TEIPIII_by_ck > 0) %>% select(ID)      # by check
list_dd <- lists_base %>% filter(TEIPIII_by_dd > 0) %>% select(ID)      # by direct deposit
list_dc <- lists_base %>% filter(TEIPIII_by_dc > 0) %>% select(ID)      # by debit card
list_ep <- lists_base %>% filter(TEIPIII_for_ep > 0) %>% select(ID)     # "mostly for expense"
list_debt <- lists_base %>% filter(TEIPIII_for_debt > 0) %>% select(ID) # "mostly to pay off debt"
list_sv <- lists_base %>% filter(TEIPIII_for_sv > 0) %>% select(ID)     # "mostly to increase savings"

#### 1.5 Find all interview without rebates ####
# Stack all family files (fmli203 to fmli213)
fmli <- bind_rows(fmli203, fmli204, fmli211, fmli212, fmli213)

# Interview-level EIP variables, all set to 0 for interviews without rebates.
# Not exactly true for May 2020 interviews, but those are taken care of later
# in the function df_modifier() / will not be included anyway.
# The i* indicator columns (iEIPI_t ... iEIPIII_for_sv_t) are intentionally
# not created; they are disabled throughout this script.
eip_zero_cols <- c(
  "EIPI_t", "EIPI_by_ck_t", "EIPI_by_dd_t", "EIPI_by_dc_t",
  "EIPI_for_ep_t", "EIPI_for_debt_t", "EIPI_for_sv_t",

  "EIPII_t", "EIPII_by_ck_t", "EIPII_by_dd_t", "EIPII_by_dc_t",
  "EIPII_for_ep_t", "EIPII_for_debt_t", "EIPII_for_sv_t",

  "EIPIII_t", "EIPIII_t_count", "EIPIII_by_ck_t", "EIPIII_by_dd_t",
  "EIPIII_by_dc_t", "EIPIII_for_ep_t", "EIPIII_for_debt_t", "EIPIII_for_sv_t",

  "EIPI_apr_t", "EIPI_may_t", "EIPI_jun_t", "EIPI_jul_t",
  "EIPI_aug_t", "EIPI_sep_t", "EIPI_oct_t", "EIPI_nov_t",
  "EIPII_dec_t", "EIPII_jan_t", "EIPII_feb_t",
  "EIPIII_mar_t", "EIPIII_apr_t", "EIPIII_may_t",
  "EIPIII_jun_t", "EIPIII_jul_t", "EIPIII_aug_t"
)

# One row per interview: NEWID followed by the zero-valued EIP columns
fmli_flt <- fmli %>% select(NEWID)
for (zero_col in eip_zero_cols) {
  fmli_flt[[zero_col]] <- 0
}

# cnt_nr contains all interviews without rebates reported
cnt_nr <- fmli_flt %>% filter(!(NEWID %in% cnt_rc$NEWID))

#### 1.6 Create basic df with all rebate information  ####
# Stack interviews with rebates (cnt_rc) and without rebates (cnt_nr) into
# cnt_f, which then contains all information about rebates, one row per NEWID.
# Base rbind() matches the data-frame columns by name, so the different column
# order of cnt_rc and cnt_nr is not a problem.
# NOTE(review): NEWID in cnt_rc may have become character in sections
# 1.0.1/1.0.2 (as.character(as.numeric(NEWID)+1)); rbind() appears to be relied
# on to coerce the two NEWID columns to a common type - confirm before
# replacing it with bind_rows().
cnt_f <- rbind(cnt_rc,cnt_nr)
# Merge cnt_f with fmli (fmli203 to fmli213) on NEWID.
# merge() keeps only NEWIDs present in both tables by default, so rebates whose
# (possibly shifted) NEWID has no matching interview in fmli are dropped.
# df now contains all EIP information as well as other info already in fmli
df <- merge(fmli,cnt_f,by="NEWID")

### 2  Panel without first difference ####
# df_modifier keeps only CU with an interview in Apr, May, or Jun 2021 in the sample
# It then select CE variables in df that are relevant to the study 
# Finally, it constructs variables used in our study
# including demographics, expenditure, lagged EIPs, and group dummies

df_modifier <- function(df){
  
  #### Create CU ID ####
  # Drop last digit of NEWID
  df <- df %>% mutate(
    ID = substr(as.character(NEWID),1,6))
  
  #### Selection of CUs ####
  # Keep only CUs that meet the basic criterion to stay in the sample
  apr_list <- df %>% filter(YYMM==2104) %>% select(ID)
  may_list <- df %>% filter(YYMM==2105) %>% select(ID)
  jun_list <- df %>% filter(YYMM==2106) %>% select(ID)
  
  df <- df %>% 
    filter(ID %in% apr_list$ID |ID %in% may_list$ID |ID %in% jun_list$ID) 
  
  #### Selecting relevant CE variables #### 
  df <- df %>% select(
    # interview info
    ID, NEWID, YYMM, INTERI,
    
    # Demographics
    PERSLT18, FAM_SIZE, AGE_REF, AGE2, SEX_REF,MARITAL1,CUTENURE,
    
    # EIPs
    EIPI_t, EIPI_by_ck_t, EIPI_by_dd_t, EIPI_by_dc_t, EIPI_for_ep_t,
    EIPI_for_debt_t, EIPI_for_sv_t,
    
    EIPII_t, EIPII_by_ck_t, EIPII_by_dd_t, EIPII_by_dc_t, EIPII_for_ep_t,
    EIPII_for_debt_t, EIPII_for_sv_t,   
    
    EIPIII_t, EIPIII_t_count, EIPIII_by_ck_t, EIPIII_by_dd_t, EIPIII_by_dc_t, EIPIII_for_ep_t,
    EIPIII_for_debt_t, EIPIII_for_sv_t,
    
    EIPI_apr_t, EIPI_may_t, EIPI_jun_t, EIPI_jul_t, EIPI_aug_t, EIPI_sep_t, EIPI_oct_t,
    EIPI_nov_t, EIPII_dec_t, EIPII_jan_t, EIPII_feb_t,  EIPIII_mar_t, EIPIII_apr_t, EIPIII_may_t,
    EIPIII_jun_t, EIPIII_jul_t, EIPIII_aug_t,
    
    # iEIPI_t, iEIPI_by_ck_t, iEIPI_by_dd_t, iEIPI_by_dc_t, 
    # iEIPI_for_ep_t, iEIPI_for_debt_t, iEIPI_for_sv_t,
    # 
    # iEIPII_t, iEIPII_by_ck_t, iEIPII_by_dd_t, iEIPII_by_dc_t, 
    # iEIPII_for_ep_t, iEIPII_for_debt_t, iEIPII_for_sv_t,
    # 
    # iEIPIII_t, iEIPIII_by_ck_t, iEIPIII_by_dd_t, iEIPIII_by_dc_t,
    # iEIPIII_for_ep_t, iEIPIII_for_debt_t, iEIPIII_for_sv_t,
    
    # Food expenditure
    FDAWAYCQ, FDAWAYPQ, FDHOMECQ,FDHOMEPQ,
    FOODCQ, FOODPQ, ALCBEVCQ, ALCBEVPQ, 
    
    # Strict non-durable expenditure
    UTILCQ, UTILPQ, HOUSOPCQ, HOUSOPPQ,
    PUBTRACQ, PUBTRAPQ, GASMOCQ, GASMOPQ,
    PERSCACQ, PERSCAPQ, TOBACCCQ, TOBACCPQ,
    MISCCQ, MISCPQ, 
    
    # Non-durables expenditure
    APPARCQ, APPARPQ,
    HEALTHCQ, HEALTHPQ, READCQ, READPQ,
    
    # Total expenditure
    TOTEXPCQ, TOTEXPPQ, 
    HOUSCQ, HOUSPQ, EDUCACQ, EDUCAPQ,
    ENTERTCQ, ENTERTPQ, TRANSCQ, TRANSPQ,
    CASHCOCQ, CASHCOPQ) %>% 
    
    #### Create demographic and expenditure variables studied ####
    mutate(
      
      # Demographics
      NUM_KIDS = PERSLT18,
      NUM_ADTS = FAM_SIZE - PERSLT18,
      AGE_AVG = ifelse(is.na(AGE2),AGE_REF,(AGE_REF + AGE2)/2),
      
      # Four big expenditure categories
      EX_FD = FOODCQ + FOODPQ + ALCBEVCQ + ALCBEVPQ,
      
      EX_SN = FOODCQ + FOODPQ + ALCBEVCQ + ALCBEVPQ +
        UTILCQ + UTILPQ + HOUSOPCQ + HOUSOPPQ +
        PUBTRACQ + PUBTRAPQ + GASMOCQ + GASMOPQ +
        PERSCACQ + PERSCAPQ + TOBACCCQ + TOBACCPQ +
        MISCCQ + MISCPQ,
      
      EX_N = EX_SN +
        APPARCQ + APPARPQ + HEALTHCQ + HEALTHPQ +
        READCQ + READPQ,
      
      EX_T = TOTEXPCQ + TOTEXPPQ,
      
      # Sub-categories - food
      EX_FD_HM = FDHOMECQ + FDHOMEPQ, # food at home
      EX_FD_AW = FDAWAYCQ + FDAWAYPQ, # food away from home 
      EX_ALC = ALCBEVCQ + ALCBEVPQ, # alcholic beverages 
      
      # Sub-categories - Strict non-durables
      EX_UT_HO = UTILCQ + UTILPQ + HOUSOPCQ + HOUSOPPQ, # utility and household operations 
      EX_PC_MIS = PERSCACQ + PERSCAPQ + MISCCQ + MISCPQ, # personal care and miscellaneous 
      EX_TR_GAS = PUBTRACQ + PUBTRAPQ + GASMOCQ + GASMOPQ, # public transportation, gas, and motor oil
      EX_TBC = TOBACCCQ + TOBACCPQ, # tobacco
      
      # Sub-categories - Non-durables 
      EX_APR = APPARCQ + APPARPQ, # apparel 
      EX_HLT = HEALTHCQ + HEALTHPQ, # health
      EX_READ = READCQ + READPQ, # reading materials
      
      # Sub-categories - Total
      EX_HS = HOUSCQ + HOUSPQ, # housing 
      EX_EDU = EDUCACQ + EDUCAPQ, # education 
      EX_ENT = ENTERTCQ + ENTERTPQ, # entertainment 
      EX_TRANS = TRANSCQ + TRANSPQ, # transportation
      EX_CACT = CASHCOCQ + CASHCOPQ # cash contributions
    ) %>% 
    
    # Make the May rebates NA since the questions are not asked 
    # mutate(
    #   EIPI_t = ifelse(YYMM==2005,NA,EIPI_t), 
    #   EIPI_by_ck_t = ifelse(YYMM==2005,NA,EIPI_by_ck_t), 
    #   EIPI_by_dd_t = ifelse(YYMM==2005,NA,EIPI_by_dd_t), 
    #   EIPI_by_dc_t = ifelse(YYMM==2005,NA,EIPI_by_dc_t), 
    #   EIPI_for_ep_t = ifelse(YYMM==2005,NA,EIPI_for_ep_t),
    #   EIPI_for_debt_t = ifelse(YYMM==2005,NA,EIPI_for_debt_t), 
    #   EIPI_for_sv_t = ifelse(YYMM==2005,NA,EIPI_for_sv_t),
    #   
    #   EIPI_apr_t = ifelse(YYMM==2005,NA,EIPI_apr_t),
    #   EIPI_may_t = ifelse(YYMM==2005,NA,EIPI_may_t),
    #   EIPI_jun_t = ifelse(YYMM==2005,NA,EIPI_jun_t),
    #   EIPI_jul_t = ifelse(YYMM==2005,NA,EIPI_jul_t),
    #   EIPI_aug_t = ifelse(YYMM==2005,NA,EIPI_aug_t),
    #   EIPI_sep_t = ifelse(YYMM==2005,NA,EIPI_sep_t),
    #   EIPI_oct_t = ifelse(YYMM==2005,NA,EIPI_oct_t),
    #   EIPI_nov_t = ifelse(YYMM==2005,NA,EIPI_nov_t)
    #   ) %>%
    
    #### Create baseline lagged EIP variables ####
    # This covers most of the cases, but some adjustments are needed 
    group_by(ID) %>%
    arrange(YYMM, .by_group = TRUE) %>%
    mutate(
      # Lag EIP I
      # The possible month of first interview range from Jul 2020 to June 2021
      # The first lag of the majority of these CUs during the first interview is unknown 
      # Exception (first interview July 2020) is addressed later
      EIPI_tm1 = lag(EIPI_t, n=1, default=NA),
      EIPI_by_ck_tm1 = lag(EIPI_by_ck_t, n=1, default=NA),
      EIPI_by_dd_tm1 = lag(EIPI_by_dd_t, n=1, default=NA),
      EIPI_by_dc_tm1 = lag(EIPI_by_dc_t, n=1, default=NA),
      EIPI_for_ep_tm1 = lag(EIPI_for_ep_t, n=1, default=NA),
      EIPI_for_debt_tm1 = lag(EIPI_for_debt_t, n=1, default=NA),
      EIPI_for_sv_tm1 = lag(EIPI_for_sv_t, n=1, default=NA),
      # Second lag EIP I
      # Similar to above, but less NAs, more 0s
      # Os are addressed later
      EIPI_tm2 = lag(EIPI_t, n=2, default=NA),
      EIPI_by_ck_tm2 = lag(EIPI_by_ck_t, n=2, default=NA),
      EIPI_by_dd_tm2 = lag(EIPI_by_dd_t, n=2, default=NA),
      EIPI_by_dc_tm2 = lag(EIPI_by_dc_t, n=2, default=NA),
      EIPI_for_ep_tm2 = lag(EIPI_for_ep_t, n=2, default=NA),
      EIPI_for_debt_tm2 = lag(EIPI_for_debt_t, n=2, default=NA),
      EIPI_for_sv_tm2 = lag(EIPI_for_sv_t, n=2, default=NA),
      # Third lag EIP I
      # Similar to above, even less NAs, more 0s
      # Os are addressed later
      EIPI_tm3 = lag(EIPI_t, n=3, default=NA),
      EIPI_by_ck_tm3 = lag(EIPI_by_ck_t, n=3, default=NA),
      EIPI_by_dd_tm3 = lag(EIPI_by_dd_t, n=3, default=NA),
      EIPI_by_dc_tm3 = lag(EIPI_by_dc_t, n=3, default=NA),
      EIPI_for_ep_tm3 = lag(EIPI_for_ep_t, n=3, default=NA),
      EIPI_for_debt_tm3 = lag(EIPI_for_debt_t, n=3, default=NA),
      EIPI_for_sv_tm3 = lag(EIPI_for_sv_t, n=3, default=NA),
      
      # Lag EIP II
      # The possible month of first interview range from Jul 2020 to June 2021
      # Except for May and June 2021, none of these interviews can potentially have lagged EIP II
      # Exceptions are addressed later 
      EIPII_tm1 = lag(EIPII_t, n=1, default=0),
      EIPII_by_ck_tm1 = lag(EIPII_by_ck_t, n=1, default=0),
      EIPII_by_dd_tm1 = lag(EIPII_by_dd_t, n=1, default=0),
      EIPII_by_dc_tm1 = lag(EIPII_by_dc_t, n=1, default=0),
      EIPII_for_ep_tm1 = lag(EIPII_for_ep_t, n=1, default=0),
      EIPII_for_debt_tm1 = lag(EIPII_for_debt_t, n=1, default=0),
      EIPII_for_sv_tm1 = lag(EIPII_for_sv_t, n=1, default=0),
      # Second lag EIP II
      # The possible month of first interview range from Jul 2020 to June 2021
      # none of these interviews can potentially have second lagged EIP II
      EIPII_tm2 = lag(EIPII_t, n=2, default=0),
      EIPII_by_ck_tm2 = lag(EIPII_by_ck_t, n=2, default=0),
      EIPII_by_dd_tm2 = lag(EIPII_by_dd_t, n=2, default=0),
      EIPII_by_dc_tm2 = lag(EIPII_by_dc_t, n=2, default=0),
      EIPII_for_ep_tm2 = lag(EIPII_for_ep_t, n=2, default=0),
      EIPII_for_debt_tm2 = lag(EIPII_for_debt_t, n=2, default=0),
      EIPII_for_sv_tm2 = lag(EIPII_for_sv_t, n=2, default=0),
      # Third lag EIP II
      # The possible month of first interview range from Jul 2020 to June 2021
      # none of these interviews can potentially have third lagged EIP II
      EIPII_tm3 = lag(EIPII_t, n=3, default=0),
      EIPII_by_ck_tm3 = lag(EIPII_by_ck_t, n=3, default=0),
      EIPII_by_dd_tm3 = lag(EIPII_by_dd_t, n=3, default=0),
      EIPII_by_dc_tm3 = lag(EIPII_by_dc_t, n=3, default=0),
      EIPII_for_ep_tm3 = lag(EIPII_for_ep_t, n=3, default=0),
      EIPII_for_debt_tm3 = lag(EIPII_for_debt_t, n=3, default=0),
      EIPII_for_sv_tm3 = lag(EIPII_for_sv_t, n=3, default=0),
      
      # Lag EIP III
      # The possible month of first interview range from Jul 2020 to June 2021
      # none of these interviews can potentially have lagged EIP III
      EIPIII_tm1 = lag(EIPIII_t, n=1, default=0),
      EIPIII_by_ck_tm1 = lag(EIPIII_by_ck_t, n=1, default=0),
      EIPIII_by_dd_tm1 = lag(EIPIII_by_dd_t, n=1, default=0),
      EIPIII_by_dc_tm1 = lag(EIPIII_by_dc_t, n=1, default=0),
      EIPIII_for_ep_tm1 = lag(EIPIII_for_ep_t, n=1, default=0),
      EIPIII_for_debt_tm1 = lag(EIPIII_for_debt_t, n=1, default=0),
      EIPIII_for_sv_tm1 = lag(EIPIII_for_sv_t, n=1, default=0),
      # Second lag EIP III
      # Similar to above 
      EIPIII_tm2 = lag(EIPIII_t, n=2, default=0),
      EIPIII_by_ck_tm2 = lag(EIPIII_by_ck_t, n=2, default=0),
      EIPIII_by_dd_tm2 = lag(EIPIII_by_dd_t, n=2, default=0),
      EIPIII_by_dc_tm2 = lag(EIPIII_by_dc_t, n=2, default=0),
      EIPIII_for_ep_tm2 = lag(EIPIII_for_ep_t, n=2, default=0),
      EIPIII_for_debt_tm2 = lag(EIPIII_for_debt_t, n=2, default=0),
      EIPIII_for_sv_tm2 = lag(EIPIII_for_sv_t, n=2, default=0),
      # Third lag EIP III
      # Similar to above 
      EIPIII_tm3 = lag(EIPIII_t, n=3, default=0),
      EIPIII_by_ck_tm3 = lag(EIPIII_by_ck_t, n=3, default=0),
      EIPIII_by_dd_tm3 = lag(EIPIII_by_dd_t, n=3, default=0),
      EIPIII_by_dc_tm3 = lag(EIPIII_by_dc_t, n=3, default=0),
      EIPIII_for_ep_tm3 = lag(EIPIII_for_ep_t, n=3, default=0),
      EIPIII_for_debt_tm3 = lag(EIPIII_for_debt_t, n=3, default=0),
      EIPIII_for_sv_tm3 = lag(EIPIII_for_sv_t, n=3, default=0),
      # Lag iEIPI
      # iEIPI_tm1 = lag(iEIPI_t, n=1, default=NA),
      # iEIPI_by_ck_tm1 = lag(iEIPI_by_ck_t, n=1, default=NA),
      # iEIPI_by_dd_tm1 = lag(iEIPI_by_dd_t, n=1, default=NA),
      # iEIPI_by_dc_tm1 = lag(iEIPI_by_dc_t, n=1, default=NA),
      # iEIPI_for_ep_tm1 = lag(iEIPI_for_ep_t, n=1, default=NA),
      # iEIPI_for_debt_tm1 = lag(iEIPI_for_debt_t, n=1, default=NA),
      # iEIPI_for_sv_tm1 = lag(iEIPI_for_sv_t, n=1, default=NA),
      # # Second lag iEIPI
      # iEIPI_tm2 = lag(iEIPI_t, n=2, default=NA),
      # iEIPI_by_ck_tm2 = lag(iEIPI_by_ck_t, n=2, default=NA),
      # iEIPI_by_dd_tm2 = lag(iEIPI_by_dd_t, n=2, default=NA),
      # iEIPI_by_dc_tm2 = lag(iEIPI_by_dc_t, n=2, default=NA),
      # iEIPI_for_ep_tm2 = lag(iEIPI_for_ep_t, n=2, default=NA),
      # iEIPI_for_debt_tm2 = lag(iEIPI_for_debt_t, n=2, default=NA),
      # iEIPI_for_sv_tm2 = lag(iEIPI_for_sv_t, n=2, default=NA),
      # # Third lag iEIPI
      # iEIPI_tm3 = lag(iEIPI_t, n=3, default=NA),
      # iEIPI_by_ck_tm3 = lag(iEIPI_by_ck_t, n=3, default=NA),
      # iEIPI_by_dd_tm3 = lag(iEIPI_by_dd_t, n=3, default=NA),
      # iEIPI_by_dc_tm3 = lag(iEIPI_by_dc_t, n=3, default=NA),
      # iEIPI_for_ep_tm3 = lag(iEIPI_for_ep_t, n=3, default=NA),
      # iEIPI_for_debt_tm3 = lag(iEIPI_for_debt_t, n=3, default=NA),
      # iEIPI_for_sv_tm3 = lag(iEIPI_for_sv_t, n=3, default=NA),
      # # Lag iEIPII
      # iEIPII_tm1 = lag(iEIPII_t, n=1, default=0),
      # iEIPII_by_ck_tm1 = lag(iEIPII_by_ck_t, n=1, default=0),
      # iEIPII_by_dd_tm1 = lag(iEIPII_by_dd_t, n=1, default=0),
      # iEIPII_by_dc_tm1 = lag(iEIPII_by_dc_t, n=1, default=0),
      # iEIPII_for_ep_tm1 = lag(iEIPII_for_ep_t, n=1, default=0),
      # iEIPII_for_debt_tm1 = lag(iEIPII_for_debt_t, n=1, default=0),
      # iEIPII_for_sv_tm1 = lag(iEIPII_for_sv_t, n=1, default=0),
      # # Second Lag iEIPII
      # iEIPII_tm2 = lag(iEIPII_t, n=2, default=0),
      # iEIPII_by_ck_tm2 = lag(iEIPII_by_ck_t, n=2, default=0),
      # iEIPII_by_dd_tm2 = lag(iEIPII_by_dd_t, n=2, default=0),
      # iEIPII_by_dc_tm2 = lag(iEIPII_by_dc_t, n=2, default=0),
      # iEIPII_for_ep_tm2 = lag(iEIPII_for_ep_t, n=2, default=0),
      # iEIPII_for_debt_tm2 = lag(iEIPII_for_debt_t, n=2, default=0),
      # iEIPII_for_sv_tm2 = lag(iEIPII_for_sv_t, n=2, default=0),
      # # Third lag iEIPII
      # iEIPII_tm3 = lag(iEIPII_t, n=3, default=0),
      # iEIPII_by_ck_tm3 = lag(iEIPII_by_ck_t, n=3, default=0),
      # iEIPII_by_dd_tm3 = lag(iEIPII_by_dd_t, n=3, default=0),
      # iEIPII_by_dc_tm3 = lag(iEIPII_by_dc_t, n=3, default=0),
      # iEIPII_for_ep_tm3 = lag(iEIPII_for_ep_t, n=3, default=0),
      # iEIPII_for_debt_tm3 = lag(iEIPII_for_debt_t, n=3, default=0),
      # iEIPII_for_sv_tm3 = lag(iEIPII_for_sv_t, n=3, default=0),
      # # Lag iEIPIII
      # iEIPIII_tm1 = lag(iEIPIII_t, n=1, default=0),
      # iEIPIII_by_ck_tm1 = lag(iEIPIII_by_ck_t, n=1, default=0),
      # iEIPIII_by_dd_tm1 = lag(iEIPIII_by_dd_t, n=1, default=0),
      # iEIPIII_by_dc_tm1 = lag(iEIPIII_by_dc_t, n=1, default=0),
      # iEIPIII_for_ep_tm1 = lag(iEIPIII_for_ep_t, n=1, default=0),
      # iEIPIII_for_debt_tm1 = lag(iEIPIII_for_debt_t, n=1, default=0),
      # iEIPIII_for_sv_tm1 = lag(iEIPIII_for_sv_t, n=1, default=0),
      # # Seocnd Lag iEIPIII
      # iEIPIII_tm2 = lag(iEIPIII_t, n=2, default=0),
      # iEIPIII_by_ck_tm2 = lag(iEIPIII_by_ck_t, n=2, default=0),
      # iEIPIII_by_dd_tm2 = lag(iEIPIII_by_dd_t, n=2, default=0),
      # iEIPIII_by_dc_tm2 = lag(iEIPIII_by_dc_t, n=2, default=0),
      # iEIPIII_for_ep_tm2 = lag(iEIPIII_for_ep_t, n=2, default=0),
      # iEIPIII_for_debt_tm2 = lag(iEIPIII_for_debt_t, n=2, default=0),
      # iEIPIII_for_sv_tm2 = lag(iEIPIII_for_sv_t, n=2, default=0),
      # # Third lag iEIPIII
      # iEIPIII_tm3 = lag(iEIPIII_t, n=3, default=0),
      # iEIPIII_by_ck_tm3 = lag(iEIPIII_by_ck_t, n=3, default=0),
      # iEIPIII_by_dd_tm3 = lag(iEIPIII_by_dd_t, n=3, default=0),
      # iEIPIII_by_dc_tm3 = lag(iEIPIII_by_dc_t, n=3, default=0),
      # iEIPIII_for_ep_tm3 = lag(iEIPIII_for_ep_t, n=3, default=0),
      # iEIPIII_for_debt_tm3 = lag(iEIPIII_for_debt_t, n=3, default=0),
      # iEIPIII_for_sv_tm3 = lag(iEIPIII_for_sv_t, n=3, default=0),
      # Lag EIP by month
      EIPI_apr_tm1 = lag(EIPI_apr_t, n=1, default=NA),
      EIPI_may_tm1 = lag(EIPI_may_t, n=1, default=NA),
      EIPI_jun_tm1 = lag(EIPI_jun_t, n=1, default=NA),
      EIPI_jul_tm1 = lag(EIPI_jul_t, n=1, default=NA),
      EIPI_aug_tm1 = lag(EIPI_aug_t, n=1, default=NA),
      EIPI_sep_tm1 = lag(EIPI_sep_t, n=1, default=NA),
      EIPI_oct_tm1 = lag(EIPI_oct_t, n=1, default=NA),
      EIPI_nov_tm1 = lag(EIPI_nov_t, n=1, default=NA),
      EIPII_dec_tm1 = lag(EIPII_dec_t, n=1, default=0),
      EIPII_jan_tm1 = lag(EIPII_jan_t, n=1, default=0),
      EIPII_feb_tm1 = lag(EIPII_feb_t, n=1, default=0),
      EIPIII_mar_tm1 = lag(EIPIII_mar_t, n=1, default=0),
      EIPIII_apr_tm1 = lag(EIPIII_apr_t, n=1, default=0),
      EIPIII_may_tm1 = lag(EIPIII_may_t, n=1, default=0),
      EIPIII_jun_tm1 = lag(EIPIII_jun_t, n=1, default=0),
      EIPIII_jul_tm1 = lag(EIPIII_jul_t, n=1, default=0),
      EIPIII_aug_tm1 = lag(EIPIII_aug_t, n=1, default=0),
      # Second lag EIP by month
      EIPI_apr_tm2 = lag(EIPI_apr_t, n=2, default=NA),
      EIPI_may_tm2 = lag(EIPI_may_t, n=2, default=NA),
      EIPI_jun_tm2 = lag(EIPI_jun_t, n=2, default=NA),
      EIPI_jul_tm2 = lag(EIPI_jul_t, n=2, default=NA),
      EIPI_aug_tm2 = lag(EIPI_aug_t, n=2, default=NA),
      EIPI_sep_tm2 = lag(EIPI_sep_t, n=2, default=NA),
      EIPI_oct_tm2 = lag(EIPI_oct_t, n=2, default=NA),
      EIPI_nov_tm2 = lag(EIPI_nov_t, n=2, default=NA),
      EIPII_dec_tm2 = lag(EIPII_dec_t, n=2, default=0),
      EIPII_jan_tm2 = lag(EIPII_jan_t, n=2, default=0),
      EIPII_feb_tm2 = lag(EIPII_feb_t, n=2, default=0),
      EIPIII_mar_tm2 = lag(EIPIII_mar_t, n=2, default=0),
      EIPIII_apr_tm2 = lag(EIPIII_apr_t, n=2, default=0),
      EIPIII_may_tm2 = lag(EIPIII_may_t, n=2, default=0),
      EIPIII_jun_tm2 = lag(EIPIII_jun_t, n=2, default=0),
      EIPIII_jul_tm2 = lag(EIPIII_jul_t, n=2, default=0),
      EIPIII_aug_tm2 = lag(EIPIII_aug_t, n=2, default=0),
      # Third lag EIP by month
      EIPI_apr_tm3 = lag(EIPI_apr_t, n=3, default=NA),
      EIPI_may_tm3 = lag(EIPI_may_t, n=3, default=NA),
      EIPI_jun_tm3 = lag(EIPI_jun_t, n=3, default=NA),
      EIPI_jul_tm3 = lag(EIPI_jul_t, n=3, default=NA),
      EIPI_aug_tm3 = lag(EIPI_aug_t, n=3, default=NA),
      EIPI_sep_tm3 = lag(EIPI_sep_t, n=3, default=NA),
      EIPI_oct_tm3 = lag(EIPI_oct_t, n=3, default=NA),
      EIPI_nov_tm3 = lag(EIPI_nov_t, n=3, default=NA),
      EIPII_dec_tm3 = lag(EIPII_dec_t, n=3, default=0),
      EIPII_jan_tm3 = lag(EIPII_jan_t, n=3, default=0),
      EIPII_feb_tm3 = lag(EIPII_feb_t, n=3, default=0),
      EIPIII_mar_tm3 = lag(EIPIII_mar_t, n=3, default=0),
      EIPIII_apr_tm3 = lag(EIPIII_apr_t, n=3, default=0),
      EIPIII_may_tm3 = lag(EIPIII_may_t, n=3, default=0),
      EIPIII_jun_tm3 = lag(EIPIII_jun_t, n=3, default=0),
      EIPIII_jul_tm3 = lag(EIPIII_jul_t, n=3, default=0),
      EIPIII_aug_tm3 = lag(EIPIII_aug_t, n=3, default=0))
  
  # IDs of the CUs interviewed in each month, from July 2020 (YYMM 2007)
  # through September 2021 (YYMM 2109). The `$ID` column of each object is
  # used below to classify CUs by the set of months in which they appear.
  jul <- select(filter(df, YYMM == 2007), ID)
  aug <- select(filter(df, YYMM == 2008), ID)
  sep <- select(filter(df, YYMM == 2009), ID)
  oct <- select(filter(df, YYMM == 2010), ID)
  nov <- select(filter(df, YYMM == 2011), ID)
  dec <- select(filter(df, YYMM == 2012), ID)
  jan_21 <- select(filter(df, YYMM == 2101), ID)
  feb_21 <- select(filter(df, YYMM == 2102), ID)
  mar_21 <- select(filter(df, YYMM == 2103), ID)
  apr_21 <- select(filter(df, YYMM == 2104), ID)
  may_21 <- select(filter(df, YYMM == 2105), ID)
  jun_21 <- select(filter(df, YYMM == 2106), ID)
  jul_21 <- select(filter(df, YYMM == 2107), ID)
  aug_21 <- select(filter(df, YYMM == 2108), ID)
  sep_21 <- select(filter(df, YYMM == 2109), ID)
  
  #### Adjustment 1: Cases with "break", and the last interview is from Apr to Jun 2021 ####
  # Adjust for the cases where 
  # 1) Only Jul, Jan, and Apr interview
  # 2) Only Aug, Feb, and May interview
  # 3) Only Sep, Mar, and Jun interview

  # CUs with a "break": present in the first month of a quarterly cycle,
  # absent three months later, and present again in the two following
  # interview months (e.g. Jul 2020, not Oct 2020, Jan 2021, Apr 2021).
  df_break <- filter(
    df,
    (ID %in% jul$ID & !(ID %in% oct$ID) & ID %in% jan_21$ID & ID %in% apr_21$ID) |
      (ID %in% aug$ID & !(ID %in% nov$ID) & ID %in% feb_21$ID & ID %in% may_21$ID) |
      (ID %in% sep$ID & !(ID %in% dec$ID) & ID %in% mar_21$ID & ID %in% jun_21$ID)
  )

  # All remaining CUs keep the lags they already have.
  df_no_break <- filter(df, !(ID %in% df_break$ID))
  
  # Only the EIP variables of the last interview of these CUs are used, and
  # getting every interview right would be cumbersome, so the lags are rebuilt
  # with the last interview in mind:
  #   *_tm1 = value one row back  (the interview three months earlier)
  #   *_tm2 = constant            (the skipped interview: NA for EIP I,
  #                                0 for EIP II and EIP III)
  #   *_tm3 = value two rows back (the interview nine months earlier)
  # Rows are ordered by YYMM within ID before lagging.

  # Current-period source columns: overall amount, the by-/for- breakdowns,
  # and the by-month amounts. "{.col}m1" turns e.g. EIPI_t into EIPI_tm1.
  eip_kinds <- c("", "_by_ck", "_by_dd", "_by_dc", "_for_ep", "_for_debt", "_for_sv")
  eip1_src <- c(
    paste0("EIPI", eip_kinds, "_t"),
    paste0("EIPI_", c("apr", "may", "jun", "jul", "aug", "sep", "oct", "nov"), "_t")
  )
  eip23_src <- c(
    paste0("EIPII", eip_kinds, "_t"),
    paste0("EIPIII", eip_kinds, "_t"),
    paste0("EIPII_", c("dec", "jan", "feb"), "_t"),
    paste0("EIPIII_", c("mar", "apr", "may", "jun", "jul", "aug"), "_t")
  )

  df_break <- df_break %>%
    group_by(ID) %>%
    arrange(YYMM, .by_group = TRUE) %>%
    mutate(
      # EIP I: missing history is NA
      across(all_of(eip1_src), ~ lag(.x, n = 1, default = NA), .names = "{.col}m1"),
      across(all_of(eip1_src), ~ NA, .names = "{.col}m2"),
      across(all_of(eip1_src), ~ lag(.x, n = 2, default = NA), .names = "{.col}m3"),
      # EIP II and EIP III: missing history is 0
      across(all_of(eip23_src), ~ lag(.x, n = 1, default = 0), .names = "{.col}m1"),
      across(all_of(eip23_src), ~ 0, .names = "{.col}m2"),
      across(all_of(eip23_src), ~ lag(.x, n = 2, default = 0), .names = "{.col}m3")
    )
  
  # Stack the untouched and the adjusted CUs, then restore the ordering by
  # CU and interview month and drop any grouping.
  df <- bind_rows(df_no_break, df_break) %>%
    ungroup() %>%
    arrange(ID, YYMM)
  
  #### Adj 2: Cases with "break", and the last interview is from Jul to Sep 2021 ####
  # 4) Only Oct, Apr, and Jul interview 
  # 5) Only Nov, May, and Aug interview 
  # 6) Only Dec, Jun, and Sep interview 
  # Two differences from Adj 1: the second lag of EIP I is now zero (assuming no EIP I
  # is reported in the Jan, Feb, and Mar 2021 interviews), and the second lag of EIP II is NA.
  # CUs with a "break" whose last interview falls in Jul-Sep 2021: present in
  # Oct/Nov/Dec 2020, absent three months later, present in the next two
  # interview months (e.g. Oct 2020, not Jan 2021, Apr 2021, Jul 2021).
  df_break <- filter(
    df,
    (ID %in% oct$ID & !(ID %in% jan_21$ID) & ID %in% apr_21$ID & ID %in% jul_21$ID) |
      (ID %in% nov$ID & !(ID %in% feb_21$ID) & ID %in% may_21$ID & ID %in% aug_21$ID) |
      (ID %in% dec$ID & !(ID %in% mar_21$ID) & ID %in% jun_21$ID & ID %in% sep_21$ID)
  )

  # All remaining CUs keep the lags they already have.
  df_no_break <- filter(df, !(ID %in% df_break$ID))
  
  # Only the EIP variables of the last interview of these CUs are used, and
  # getting every interview right would be cumbersome, so the lags are rebuilt
  # with the last interview in mind:
  #   *_tm1 = value one row back  (the interview three months earlier)
  #   *_tm2 = constant            (the skipped Jan/Feb/Mar 2021 interview)
  #   *_tm3 = value two rows back (the interview nine months earlier)
  # Constants for the skipped interview:
  #   EIP I   -> 0  (assuming no EIP I in Jan, Feb, and March interviews)
  #   EIP II  -> NA (CUs interviewed in Oct, Apr, and Jul are an exception,
  #                  which is addressed later)
  #   EIP III -> 0
  # Rows are ordered by YYMM within ID before lagging.

  # Current-period source columns: overall amount, the by-/for- breakdowns,
  # and the by-month amounts. "{.col}m1" turns e.g. EIPI_t into EIPI_tm1.
  eip_kinds <- c("", "_by_ck", "_by_dd", "_by_dc", "_for_ep", "_for_debt", "_for_sv")
  eip1_src <- c(
    paste0("EIPI", eip_kinds, "_t"),
    paste0("EIPI_", c("apr", "may", "jun", "jul", "aug", "sep", "oct", "nov"), "_t")
  )
  eip2_src <- c(
    paste0("EIPII", eip_kinds, "_t"),
    paste0("EIPII_", c("dec", "jan", "feb"), "_t")
  )
  eip3_src <- c(
    paste0("EIPIII", eip_kinds, "_t"),
    paste0("EIPIII_", c("mar", "apr", "may", "jun", "jul", "aug"), "_t")
  )
  eip23_src <- c(eip2_src, eip3_src)

  df_break <- df_break %>%
    group_by(ID) %>%
    arrange(YYMM, .by_group = TRUE) %>%
    mutate(
      # EIP I
      across(all_of(eip1_src), ~ lag(.x, n = 1, default = NA), .names = "{.col}m1"),
      across(all_of(eip1_src), ~ 0, .names = "{.col}m2"),
      across(all_of(eip1_src), ~ lag(.x, n = 2, default = NA), .names = "{.col}m3"),
      # EIP II and EIP III
      across(all_of(eip23_src), ~ lag(.x, n = 1, default = 0), .names = "{.col}m1"),
      across(all_of(eip2_src), ~ NA, .names = "{.col}m2"),
      across(all_of(eip3_src), ~ 0, .names = "{.col}m2"),
      across(all_of(eip23_src), ~ lag(.x, n = 2, default = 0), .names = "{.col}m3")
    )

  # Stack the untouched and the adjusted CUs again.
  df <- bind_rows(df_no_break, df_break)
  
  #### Adj 3: No second lag EIPII for CUs interviewed only in Oct, Apr, and jul ####
  # CUs interviewed in Oct 2020, Apr 2021, and Jul 2021 but not in Jan 2021.
  df_break <- filter(
    df,
    ID %in% oct$ID & !(ID %in% jan_21$ID) & ID %in% apr_21$ID & ID %in% jul_21$ID
  )

  # Every other CU is left as is.
  df_no_break <- filter(df, !(ID %in% df_break$ID))
  
  # Overall accuracy matters less here; the point is that the last observation
  # of each CU is correct. For these CUs every second-lag EIP II variable
  # (total, by-/for- breakdowns, and the Dec/Jan/Feb amounts) is set to 0.
  eipII_tm2_cols <- c(
    "EIPII_tm2", "EIPII_by_ck_tm2", "EIPII_by_dd_tm2", "EIPII_by_dc_tm2",
    "EIPII_for_ep_tm2", "EIPII_for_debt_tm2", "EIPII_for_sv_tm2",
    "EIPII_dec_tm2", "EIPII_jan_tm2", "EIPII_feb_tm2"
  )

  df_break <- df_break %>%
    group_by(ID) %>%
    arrange(YYMM, .by_group = TRUE) %>%
    mutate(across(all_of(eipII_tm2_cols), ~ 0))
  
  # Stack the untouched and the adjusted CUs, then restore the ordering by
  # CU and interview month and drop any grouping.
  df <- bind_rows(df_no_break, df_break) %>%
    ungroup() %>%
    arrange(ID, YYMM)

  #### Adj 4: EIPI and II lags for CUs first interviewed in Apr, May, or Jun ####
  # One row per CU: the first row of each ID in df. Because df has just been
  # sorted by ID and YYMM, that row is the CU's earliest interview.
  df_list <- distinct(df, ID, .keep_all = TRUE)

  # CUs whose earliest interview is in Apr, May, or Jun 2021.
  apr_first <- select(filter(df_list, YYMM == 2104), ID)
  may_first <- select(filter(df_list, YYMM == 2105), ID)
  jun_first <- select(filter(df_list, YYMM == 2106), ID)

  # Rows of those CUs (to be adjusted) versus all other rows.
  df_adj <- filter(
    df,
    (ID %in% apr_first$ID) | (ID %in% may_first$ID) | (ID %in% jun_first$ID)
  )

  df_no_adj <- filter(df, !(ID %in% df_adj$ID))
  
  # Correct the EIP I / EIP II lags of CUs first interviewed in Apr, May, or
  # Jun 2021. Every column is rewritten from its OWN previous value through
  # across(), so one column can never be filled from another.
  # (The previous hand-written version assigned EIPII_jan_tm1 from
  # EIPII_feb_tm1 and EIPII_feb_tm1 from EIPII_jan_tm1, and the same for
  # _tm2. mutate() evaluates its arguments in order, so the January lags were
  # overwritten with the February values.)
  eip_kinds <- c("", "_by_ck", "_by_dd", "_by_dc", "_for_ep", "_for_debt", "_for_sv")
  eipI_stems <- c(
    paste0("EIPI", eip_kinds),
    paste0("EIPI_", c("apr", "may", "jun", "jul", "aug", "sep", "oct", "nov"))
  )
  eipII_stems <- c(
    paste0("EIPII", eip_kinds),
    paste0("EIPII_", c("dec", "jan", "feb"))
  )
  eipI_tm1 <- paste0(eipI_stems, "_tm1")
  eipI_tm2 <- paste0(eipI_stems, "_tm2")
  eipII_tm1 <- paste0(eipII_stems, "_tm1")
  eipII_tm2 <- paste0(eipII_stems, "_tm2")

  df_adj <- df_adj %>% mutate(

    # Adjusting for the zero in EIPII lags
    # Given that CUs can report EIPII in feb or march
    # The following should be NAs
    across(all_of(eipII_tm1), ~ ifelse((YYMM == 2105 | YYMM == 2106), NA, .x)),
    across(all_of(eipII_tm2), ~ ifelse((YYMM == 2108 | YYMM == 2109), NA, .x)),

    # Assuming no EIPI in the reference periods of Jan, Feb, March interviews
    # The following should be zeros instead of NAs
    across(
      all_of(eipI_tm1),
      ~ ifelse((YYMM == 2104 | YYMM == 2105 | YYMM == 2106), 0, .x)
    ),
    across(
      all_of(eipI_tm2),
      ~ ifelse((YYMM == 2107 | YYMM == 2108 | YYMM == 2109), 0, .x)
    ))
  
  # Stack the untouched and the adjusted CUs, then restore the ordering by
  # CU and interview month and drop any grouping.
  df <- bind_rows(df_no_adj, df_adj) %>%
    ungroup() %>%
    arrange(ID, YYMM)
  
  
  #### Adj 5: EIPI lags should be 0 instead of NA for early interviews ####
   df <- df %>%  mutate(
     EIPI_tm1 = ifelse(YYMM==2007, 0, EIPI_tm1),
     EIPI_by_ck_tm1 = ifelse(YYMM==2007, 0, EIPI_by_ck_tm1),
     EIPI_by_dd_tm1 = ifelse(YYMM==2007, 0, EIPI_by_dd_tm1),
     EIPI_by_dc_tm1 = ifelse(YYMM==2007, 0, EIPI_by_dc_tm1),
     EIPI_for_ep_tm1 = ifelse(YYMM==2007, 0, EIPI_for_ep_tm1),
     EIPI_for_debt_tm1 = ifelse(YYMM==2007, 0, EIPI_for_debt_tm1),
     EIPI_for_sv_tm1 = ifelse(YYMM==2007, 0, EIPI_for_sv_tm1),
     
     EIPI_tm2 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|YYMM==2010), 0, EIPI_tm2), 
     EIPI_by_ck_tm2 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|YYMM==2010), 0, EIPI_by_ck_tm2), 
     EIPI_by_dd_tm2 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|YYMM==2010), 0, EIPI_by_dd_tm2), 
     EIPI_by_dc_tm2 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|YYMM==2010), 0, EIPI_by_dc_tm2), 
     EIPI_for_ep_tm2 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|YYMM==2010), 0, EIPI_for_ep_tm2),
     EIPI_for_debt_tm2 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|YYMM==2010), 0, EIPI_for_debt_tm2), 
     EIPI_for_sv_tm2 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|YYMM==2010), 0, EIPI_for_sv_tm2),
     
     EIPI_tm3 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|
                          YYMM==2010|YYMM==2011|YYMM==2012|YYMM==2101), 0, EIPI_tm3), 
     EIPI_by_ck_tm3 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|
                                YYMM==2010|YYMM==2011|YYMM==2012|YYMM==2101), 0, EIPI_by_ck_tm3), 
     EIPI_by_dd_tm3 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|
                                YYMM==2010|YYMM==2011|YYMM==2012|YYMM==2101), 0, EIPI_by_dd_tm3), 
     EIPI_by_dc_tm3 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|
                                YYMM==2010|YYMM==2011|YYMM==2012|YYMM==2101), 0, EIPI_by_dc_tm3), 
     EIPI_for_ep_tm3 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|
                                 YYMM==2010|YYMM==2011|YYMM==2012|YYMM==2101), 0, EIPI_for_ep_tm3),
     EIPI_for_debt_tm3 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|
                                   YYMM==2010|YYMM==2011|YYMM==2012|YYMM==2101), 0, EIPI_for_debt_tm3), 
     EIPI_for_sv_tm3 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|
                                 YYMM==2010|YYMM==2011|YYMM==2012|YYMM==2101), 0, EIPI_for_sv_tm3),
     
     EIPI_apr_tm1 = ifelse(YYMM==2007, 0, EIPI_apr_tm1),
     EIPI_may_tm1 = ifelse(YYMM==2007, 0, EIPI_may_tm1),
     EIPI_jun_tm1 = ifelse(YYMM==2007, 0, EIPI_jun_tm1),
     EIPI_jul_tm1 = ifelse(YYMM==2007, 0, EIPI_jul_tm1),
     EIPI_aug_tm1 = ifelse(YYMM==2007, 0, EIPI_aug_tm1),
     EIPI_sep_tm1 = ifelse(YYMM==2007, 0, EIPI_sep_tm1),
     EIPI_oct_tm1 = ifelse(YYMM==2007, 0, EIPI_oct_tm1),
     EIPI_nov_tm1 = ifelse(YYMM==2007, 0, EIPI_nov_tm1),
     
     EIPI_apr_tm2 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|YYMM==2010), 0, EIPI_apr_tm2), 
     EIPI_may_tm2 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|YYMM==2010), 0, EIPI_may_tm2), 
     EIPI_jun_tm2 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|YYMM==2010), 0, EIPI_jun_tm2), 
     EIPI_jul_tm2 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|YYMM==2010), 0, EIPI_jul_tm2), 
     EIPI_aug_tm2 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|YYMM==2010), 0, EIPI_aug_tm2),
     EIPI_sep_tm2 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|YYMM==2010), 0, EIPI_sep_tm2),
     EIPI_oct_tm2 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|YYMM==2010), 0, EIPI_oct_tm2),
     EIPI_nov_tm2 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|YYMM==2010), 0, EIPI_nov_tm2),
     
     EIPI_apr_tm3 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|
                              YYMM==2010|YYMM==2011|YYMM==2012|YYMM==2101), 0, EIPI_apr_tm3), 
     EIPI_may_tm3 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|
                              YYMM==2010|YYMM==2011|YYMM==2012|YYMM==2101), 0, EIPI_may_tm3), 
     EIPI_jun_tm3 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|
                              YYMM==2010|YYMM==2011|YYMM==2012|YYMM==2101), 0, EIPI_jun_tm3), 
     EIPI_jul_tm3 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|
                              YYMM==2010|YYMM==2011|YYMM==2012|YYMM==2101), 0, EIPI_jul_tm3), 
     EIPI_aug_tm3 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|
                              YYMM==2010|YYMM==2011|YYMM==2012|YYMM==2101), 0, EIPI_aug_tm3),
     EIPI_sep_tm3 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|
                              YYMM==2010|YYMM==2011|YYMM==2012|YYMM==2101), 0, EIPI_sep_tm3),
     EIPI_oct_tm3 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|
                              YYMM==2010|YYMM==2011|YYMM==2012|YYMM==2101), 0, EIPI_oct_tm3),
     EIPI_nov_tm3 = ifelse((YYMM==2007|YYMM==2008|YYMM==2009|
                              YYMM==2010|YYMM==2011|YYMM==2012|YYMM==2101), 0, EIPI_nov_tm3)) %>%
     
     select(
      ID, NEWID, YYMM, INTERI,
      
      # Demographics
      NUM_KIDS,NUM_ADTS, FAM_SIZE, AGE_REF, AGE2, AGE_AVG, SEX_REF,MARITAL1,CUTENURE,
      
      # expenditures
      EX_FD,EX_SN,EX_N,EX_T,EX_FD_HM,EX_FD_AW,EX_ALC,
      EX_UT_HO,EX_PC_MIS,EX_TR_GAS,EX_TBC,EX_APR,EX_HLT,EX_READ,
      EX_HS,EX_EDU,EX_ENT,EX_TRANS,EX_CACT,
      
      # EIPs
      
      # EIPI, by types, and lags
      EIPI_t, EIPI_by_ck_t, EIPI_by_dd_t, EIPI_by_dc_t, EIPI_for_ep_t,
      EIPI_for_debt_t, EIPI_for_sv_t,
      EIPI_tm1, EIPI_by_ck_tm1, EIPI_by_dd_tm1, EIPI_by_dc_tm1, EIPI_for_ep_tm1,
      EIPI_for_debt_tm1, EIPI_for_sv_tm1,
      EIPI_tm2, EIPI_by_ck_tm2, EIPI_by_dd_tm2, EIPI_by_dc_tm2, EIPI_for_ep_tm2,
      EIPI_for_debt_tm2, EIPI_for_sv_tm2,
      EIPI_tm3, EIPI_by_ck_tm3, EIPI_by_dd_tm3, EIPI_by_dc_tm3, EIPI_for_ep_tm3,
      EIPI_for_debt_tm3, EIPI_for_sv_tm3,
      # EIPII, by types, and lags
      EIPII_t, EIPII_by_ck_t, EIPII_by_dd_t, EIPII_by_dc_t, EIPII_for_ep_t,
      EIPII_for_debt_t, EIPII_for_sv_t,
      EIPII_tm1, EIPII_by_ck_tm1, EIPII_by_dd_tm1, EIPII_by_dc_tm1, EIPII_for_ep_tm1,
      EIPII_for_debt_tm1, EIPII_for_sv_tm1,
      EIPII_tm2, EIPII_by_ck_tm2, EIPII_by_dd_tm2, EIPII_by_dc_tm2, EIPII_for_ep_tm2,
      EIPII_for_debt_tm2, EIPII_for_sv_tm2,
      EIPII_tm3, EIPII_by_ck_tm3, EIPII_by_dd_tm3, EIPII_by_dc_tm3, EIPII_for_ep_tm3,
      EIPII_for_debt_tm3, EIPII_for_sv_tm3,
      # EIPIII, by types, and lags
      EIPIII_t, EIPIII_t_count, EIPIII_by_ck_t, EIPIII_by_dd_t, EIPIII_by_dc_t, EIPIII_for_ep_t,
      EIPIII_for_debt_t, EIPIII_for_sv_t,
      EIPIII_tm1, EIPIII_by_ck_tm1, EIPIII_by_dd_tm1, EIPIII_by_dc_tm1, EIPIII_for_ep_tm1,
      EIPIII_for_debt_tm1, EIPIII_for_sv_tm1,
      EIPIII_tm2, EIPIII_by_ck_tm2, EIPIII_by_dd_tm2, EIPIII_by_dc_tm2, EIPIII_for_ep_tm2,
      EIPIII_for_debt_tm2, EIPIII_for_sv_tm2,
      EIPIII_tm3, EIPIII_by_ck_tm3, EIPIII_by_dd_tm3, EIPIII_by_dc_tm3, EIPIII_for_ep_tm3,
      EIPIII_for_debt_tm3, EIPIII_for_sv_tm3,
      # EIP by month and lags
      EIPI_apr_t, EIPI_may_t, EIPI_jun_t, EIPI_jul_t, EIPI_aug_t, EIPI_sep_t, EIPI_oct_t,
      EIPI_nov_t, EIPII_dec_t, EIPII_jan_t, EIPII_feb_t,  EIPIII_mar_t, EIPIII_apr_t, EIPIII_may_t,
      EIPIII_jun_t, EIPIII_jul_t, EIPIII_aug_t,
      EIPI_apr_tm1, EIPI_may_tm1, EIPI_jun_tm1, EIPI_jul_tm1, EIPI_aug_tm1, EIPI_sep_tm1, EIPI_oct_tm1,
      EIPI_nov_tm1, EIPII_dec_tm1, EIPII_jan_tm1, EIPII_feb_tm1,  EIPIII_mar_tm1, EIPIII_apr_tm1, EIPIII_may_tm1,
      EIPIII_jun_tm1, EIPIII_jul_tm1, EIPIII_aug_tm1,
      EIPI_apr_tm2, EIPI_may_tm2, EIPI_jun_tm2, EIPI_jul_tm2, EIPI_aug_tm2, EIPI_sep_tm2, EIPI_oct_tm2,
      EIPI_nov_tm2, EIPII_dec_tm2, EIPII_jan_tm2, EIPII_feb_tm2,  EIPIII_mar_tm2, EIPIII_apr_tm2, EIPIII_may_tm2,
      EIPIII_jun_tm2, EIPIII_jul_tm2, EIPIII_aug_tm2,
      EIPI_apr_tm3, EIPI_may_tm3, EIPI_jun_tm3, EIPI_jul_tm3, EIPI_aug_tm3, EIPI_sep_tm3, EIPI_oct_tm3,
      EIPI_nov_tm3, EIPII_dec_tm3, EIPII_jan_tm3, EIPII_feb_tm3,  EIPIII_mar_tm3, EIPIII_apr_tm3, EIPIII_may_tm3,
      EIPIII_jun_tm3, EIPIII_jul_tm3, EIPIII_aug_tm3
      ) 
  
   ####  Create group dummies ####
   df <- df %>% mutate(
     r = ifelse(ID %in% list_r$ID, 1, 0), # dummy for recipient 
     ck = ifelse(ID %in% list_ck$ID, 1, 0), # dummy for receiving EIPIII only by check 
     dd = ifelse(ID %in% list_dd$ID, 1, 0), # dummy for receiving EIPIII only by direct deposit
     dc = ifelse(ID %in% list_dc$ID, 1, 0), # dummy for receiving EIPIII only by debit card 
     
     # Assign NAs 
     ck = ifelse((r == 1 & !(ID %in% list_ck$ID) & !(ID %in% list_dd$ID) & !(ID %in% list_dc$ID)), NA, ck),
     dd = ifelse((r == 1 & !(ID %in% list_ck$ID) & !(ID %in% list_dd$ID) & !(ID %in% list_dc$ID)), NA, dd),
     dc = ifelse((r == 1 & !(ID %in% list_ck$ID) & !(ID %in% list_dd$ID) & !(ID %in% list_dc$ID)), NA, dc),
     
     # Combinations
     
     ck_dd = ifelse(ck==1 & dd==1,1,0), # dummy for receiving EIPIII by both check and direct deposit 
     ck_dc = ifelse(ck==1 & dc==1,1,0), # dummy for receiving EIPIII by both check and debit card 
     dd_dc = ifelse(dd==1 & dc==1,1,0), # dummy for receiving EIPIII by both direct deposit and debit card
     ck_dd_dc = ifelse(ck==1 & dd==1 & dc==1,1,0),# dummy for receiving EIPIII by all methods
     
     # Adjusting ck and dd to 0 if ck_dd is 1
     ck = ifelse(ck_dd == 1, 0, ck),
     dd = ifelse(ck_dd == 1, 0, dd),
     
     # similar to above
     ck = ifelse(ck_dc == 1, 0, ck),
     dc = ifelse(ck_dc == 1, 0, dc),
     
     dd = ifelse(dd_dc == 1, 0, dd),
     dc = ifelse(dd_dc == 1, 0, dc),
     
     ck = ifelse(ck_dd_dc == 1, 0, ck),
     dd = ifelse(ck_dd_dc == 1, 0, dd),
     dc = ifelse(ck_dd_dc == 1, 0, dc),
     
     ck_dd = ifelse(ck_dd_dc==1, 0, ck_dd),
     ck_dc = ifelse(ck_dd_dc==1, 0, ck_dc),
     dd_dc = ifelse(ck_dd_dc==1, 0, dd_dc),
     
     # usage
     ep = ifelse(ID %in% list_ep$ID, 1, 0), # dummy for only using EIPIII "mostly for expenses" 
     debt = ifelse(ID %in% list_debt$ID, 1, 0), # dummy for using EIPIII "mostly to pay off debt"
     sv = ifelse(ID %in% list_sv$ID, 1, 0), # dummy for using EIPIII "mostly to add to savings"
     
     # Assign NAs 
     ep = ifelse((r == 1 & !(ID %in% list_ep$ID) & !(ID %in% list_debt$ID) & !(ID %in% list_sv$ID)), NA, ep),
     debt = ifelse((r == 1 & !(ID %in% list_ep$ID) & !(ID %in% list_debt$ID) & !(ID %in% list_sv$ID)), NA, debt),
     sv = ifelse((r == 1 & !(ID %in% list_ep$ID) & !(ID %in% list_debt$ID) & !(ID %in% list_sv$ID)), NA, sv),
     
     # Combinations
     ep_debt = ifelse(ep==1 & debt==1,1,0), # dummy for using EIPIII "mostly" for both "expenses" and "debt"
     ep_sv = ifelse(ep==1 & sv==1,1,0), # dummy for using EIPIII "mostly" for both "expenses" and "savings" 
     debt_sv = ifelse(debt==1 & sv==1,1,0), # dummy for using EIPIII  "mostly" for both "debt" and "savings" 
     ep_debt_sv = ifelse(debt==1 & sv==1 & ep ==1,1,0), # dummy for using EIPIII "mostly" for all three purposes
     
     # Adjusting ep and debt to 0 if ep_debt is 1
     ep = ifelse(ep_debt == 1, 0 , ep),
     debt = ifelse(ep_debt == 1, 0 , debt),
     # similar to above
     ep = ifelse(ep_sv == 1, 0 , ep),
     sv = ifelse(ep_sv == 1, 0 , sv),
     
     debt = ifelse(debt_sv == 1, 0 , debt),
     sv = ifelse(debt_sv == 1, 0 , sv),
     
     ep = ifelse(ep_debt_sv == 1, 0 , ep),
     debt = ifelse(ep_debt_sv == 1, 0 , debt),
     sv = ifelse(ep_debt_sv == 1, 0 , sv),
     
     ep_debt = ifelse(ep_debt_sv==1,0,ep_debt),
     ep_sv = ifelse(ep_debt_sv==1,0,ep_sv),
     debt_sv = ifelse(ep_debt_sv==1,0,debt_sv))
  
  return(df)
}

# Run df_modifier on the panel and overwrite df with the data frame it returns
df <- df_modifier(df)

### 3 Panel with first difference ####

#### 3.1  Obtain and compute all variables of interest from fmli ####
# fmli_modifier is the family-file counterpart of df_modifier: it keeps the CE
# variables needed from one fmli file and constructs the variables used in the
# study.
# Input:  fmli - a family file that already carries YYMM (recoded from
#         QINTRVMO at the top of the script)
# Output: ID (first six characters of NEWID), the demographics, and the EX_*
#         expenditure aggregates; each aggregate adds the CQ and PQ columns of
#         its components
fmli_modifier <- function(fmli){
  raw_vars <- c(
    # interview info
    "NEWID", "YYMM", "INTERI",
    # demographics
    "PERSLT18", "FAM_SIZE", "AGE_REF", "AGE2", "SEX_REF", "MARITAL1", "CUTENURE",
    # food expenditure
    "FDAWAYCQ", "FDAWAYPQ", "FDHOMECQ", "FDHOMEPQ",
    "FOODCQ", "FOODPQ", "ALCBEVCQ", "ALCBEVPQ",
    # strict non-durables expenditure
    "UTILCQ", "UTILPQ", "HOUSOPCQ", "HOUSOPPQ",
    "PUBTRACQ", "PUBTRAPQ", "GASMOCQ", "GASMOPQ",
    "PERSCACQ", "PERSCAPQ", "TOBACCCQ", "TOBACCPQ",
    "MISCCQ", "MISCPQ",
    # non-durables expenditure
    "APPARCQ", "APPARPQ", "HEALTHCQ", "HEALTHPQ", "READCQ", "READPQ",
    # total expenditure
    "TOTEXPCQ", "TOTEXPPQ",
    "HOUSCQ", "HOUSPQ", "EDUCACQ", "EDUCAPQ",
    "ENTERTCQ", "ENTERTPQ", "TRANSCQ", "TRANSPQ",
    "CASHCOCQ", "CASHCOPQ"
  )

  out_vars <- c(
    "ID",
    "NUM_KIDS", "NUM_ADTS", "FAM_SIZE", "AGE_REF", "AGE2", "AGE_AVG",
    "SEX_REF", "MARITAL1", "CUTENURE",
    "EX_FD", "EX_SN", "EX_N", "EX_T", "EX_FD_HM", "EX_FD_AW", "EX_ALC",
    "EX_UT_HO", "EX_PC_MIS", "EX_TR_GAS", "EX_TBC", "EX_APR", "EX_HLT", "EX_READ",
    "EX_HS", "EX_EDU", "EX_ENT", "EX_TRANS", "EX_CACT"
  )

  constructed <- fmli %>%
    select(all_of(raw_vars)) %>%
    mutate(
      # household identifier shared by all interviews of a CU
      ID = substr(as.character(NEWID), 1, 6),

      # demographics: kids are members under 18, adults are everyone else;
      # the average age falls back to AGE_REF when AGE2 is missing
      NUM_KIDS = PERSLT18,
      NUM_ADTS = FAM_SIZE - PERSLT18,
      AGE_AVG = ifelse(is.na(AGE2), AGE_REF, (AGE_REF + AGE2) / 2),

      # four nested headline categories: food, strict non-durables (food plus
      # utilities, household operations, transport/gas, personal care, tobacco,
      # miscellaneous), non-durables (adds apparel, health, reading), and total
      EX_FD = FOODCQ + FOODPQ + ALCBEVCQ + ALCBEVPQ,
      EX_SN = EX_FD +
        UTILCQ + UTILPQ + HOUSOPCQ + HOUSOPPQ +
        PUBTRACQ + PUBTRAPQ + GASMOCQ + GASMOPQ +
        PERSCACQ + PERSCAPQ + TOBACCCQ + TOBACCPQ +
        MISCCQ + MISCPQ,
      EX_N = EX_SN +
        APPARCQ + APPARPQ + HEALTHCQ + HEALTHPQ +
        READCQ + READPQ,
      EX_T = TOTEXPCQ + TOTEXPPQ,

      # food sub-categories
      EX_FD_HM = FDHOMECQ + FDHOMEPQ,
      EX_FD_AW = FDAWAYCQ + FDAWAYPQ,
      EX_ALC = ALCBEVCQ + ALCBEVPQ,

      # strict non-durables sub-categories
      EX_UT_HO = UTILCQ + UTILPQ + HOUSOPCQ + HOUSOPPQ,
      EX_PC_MIS = PERSCACQ + PERSCAPQ + MISCCQ + MISCPQ,
      EX_TR_GAS = PUBTRACQ + PUBTRAPQ + GASMOCQ + GASMOPQ,
      EX_TBC = TOBACCCQ + TOBACCPQ,

      # non-durables sub-categories
      EX_APR = APPARCQ + APPARPQ,
      EX_HLT = HEALTHCQ + HEALTHPQ,
      EX_READ = READCQ + READPQ,

      # total expenditure sub-categories
      EX_HS = HOUSCQ + HOUSPQ,
      EX_EDU = EDUCACQ + EDUCAPQ,
      EX_ENT = ENTERTCQ + ENTERTPQ,
      EX_TRANS = TRANSCQ + TRANSPQ,
      EX_CACT = CASHCOCQ + CASHCOPQ
    )

  # keep only the ID, the demographics, and the constructed aggregates
  constructed %>% select(all_of(out_vars))
}

# Replace every quarterly family file with its fmli_modifier output
fmli203 <- fmli203 %>% fmli_modifier()
fmli204 <- fmli204 %>% fmli_modifier()
fmli211 <- fmli211 %>% fmli_modifier()
fmli212 <- fmli212 %>% fmli_modifier()
fmli213 <- fmli213 %>% fmli_modifier()


#### 3.2 Computing first difference ####
# diff_maker merges a set of interviews with the same households' previous
# interviews and computes first differences.
#
# Arguments:
#   df        current-interview rows; must contain ID plus the demographic and
#             EX_* columns listed below
#   fmli_tm1  previous-interview rows containing the same columns (the output
#             of fmli_modifier)
# Returns: a data.frame with one row per ID present in both inputs. Columns
#   found in both inputs carry the suffix _t (current) or _tm1 (previous);
#   d_* columns hold current minus previous. Expenditure differences are
#   rounded to one decimal place.

diff_maker <- function(df,fmli_tm1){
  demo_vars <- c("FAM_SIZE", "AGE_REF", "AGE2", "AGE_AVG", "SEX_REF",
                 "MARITAL1", "NUM_ADTS", "NUM_KIDS", "CUTENURE")
  expd_vars <- c("EX_FD", "EX_SN", "EX_N", "EX_T",
                 "EX_FD_HM", "EX_FD_AW", "EX_ALC",
                 "EX_UT_HO", "EX_PC_MIS", "EX_TR_GAS", "EX_TBC",
                 "EX_APR", "EX_HLT", "EX_READ",
                 "EX_HS", "EX_EDU", "EX_ENT", "EX_TRANS", "EX_CACT")

  # Fail early, with a readable message, if either input lacks a needed column
  required <- c("ID", demo_vars, expd_vars)
  absent <- unique(c(setdiff(required, names(df)),
                     setdiff(required, names(fmli_tm1))))
  if (length(absent) > 0) {
    stop("diff_maker: missing column(s): ", paste(absent, collapse = ", "),
         call. = FALSE)
  }

  # Inner join on ID; merge() itself appends _t / _tm1 to the shared columns
  df <- merge(df, fmli_tm1, by = "ID", suffixes = c("_t", "_tm1"))

  # current minus previous value of one variable
  change <- function(var) df[[paste0(var, "_t")]] - df[[paste0(var, "_tm1")]]

  # change in family composition
  df$d_FAM_SIZE_t <- change("FAM_SIZE")
  df$d_NUM_ADTS_t <- change("NUM_ADTS")
  df$d_NUM_KIDS_t <- change("NUM_KIDS")

  # change in age (d_AGE_2 keeps its historical name; later steps select it)
  df$d_AGE_REF_t <- change("AGE_REF")
  df$d_AGE_2 <- change("AGE2")

  # change in every expenditure category, rounded to one decimal. round() is
  # applied directly: the earlier as.numeric(format(...)) round-trip added
  # nothing and could drop the decimal when all values in a column were large.
  for (v in expd_vars) {
    df[[paste0("d_", v, "_t")]] <- round(change(v), 1)
  }

  df
}

#### 3.3 Incorporating first difference into df ####
# Split df by interview quarter so that each quarter can be paired with the
# family file of the quarter before it
df_20q4 <- df %>% filter(YYMM %in% c(2010, 2011, 2012))
df_21q1 <- df %>% filter(YYMM %in% c(2101, 2102, 2103))
df_21q2 <- df %>% filter(YYMM %in% c(2104, 2105, 2106))
df_21q3 <- df %>% filter(YYMM %in% c(2107, 2108, 2109))

# Merge each quarter with the previous quarter's interviews and take first
# differences
df_20q4 <- df_20q4 %>% diff_maker(fmli203)
df_21q1 <- df_21q1 %>% diff_maker(fmli204)
df_21q2 <- df_21q2 %>% diff_maker(fmli211)
df_21q3 <- df_21q3 %>% diff_maker(fmli212)

# Stack the quarters back into one panel, ordered by household and then by
# interview month; as_tibble() keeps df a tibble, as it was after the former
# group_by()/ungroup() step
df <- bind_rows(df_20q4, df_21q1, df_21q2, df_21q3) %>%
  arrange(ID, YYMM) %>%
  as_tibble()

### 4 Computing average weights, the first documented income, and liquidity ####
# wts_inc_liq_creator extracts the weight (FINLWT21), income (FINCBTXM), and
# liquidity (LIQUDYRX) columns from one raw fmli file and keys them by ID
# (the first six characters of NEWID)
wts_inc_liq_creator <- function(fmli){
  picked <- select(fmli, NEWID, FINLWT21, FINCBTXM, LIQUDYR, LIQUDYRX)
  picked$ID <- substr(as.character(picked$NEWID), 1, 6)

  # LIQUDYR == 2 marks a CU that reports no liquid accounts: liquidity is 0
  no_liquid_accounts <- !is.na(picked$LIQUDYR) & picked$LIQUDYR == 2
  picked$LIQUDYRX <- ifelse(no_liquid_accounts, 0, picked$LIQUDYRX)

  select(picked, ID, FINLWT21, FINCBTXM, LIQUDYRX)
}

# Run wts_inc_liq_creator on the untouched copy of every fmli file
fmli203_wts_inc_liq <- fmli203_copy %>% wts_inc_liq_creator()
fmli204_wts_inc_liq <- fmli204_copy %>% wts_inc_liq_creator()
fmli211_wts_inc_liq <- fmli211_copy %>% wts_inc_liq_creator()
fmli212_wts_inc_liq <- fmli212_copy %>% wts_inc_liq_creator()
fmli213_wts_inc_liq <- fmli213_copy %>% wts_inc_liq_creator()

# Full outer joins on ID across the five files; every merged column is tagged
# with the file it came from (_203 ... _213)
wts_inc_liq <- merge(fmli203_wts_inc_liq, fmli204_wts_inc_liq, by = "ID",
                     all = TRUE, suffixes = c("_203", "_204"))

wts_inc_liq <- merge(wts_inc_liq, fmli211_wts_inc_liq, by = "ID", all = TRUE) %>%
  rename_with(~ paste0(.x, "_211"), c(FINLWT21, FINCBTXM, LIQUDYRX))

wts_inc_liq <- merge(wts_inc_liq, fmli212_wts_inc_liq, by = "ID", all = TRUE) %>%
  rename_with(~ paste0(.x, "_212"), c(FINLWT21, FINCBTXM, LIQUDYRX))

wts_inc_liq <- merge(wts_inc_liq, fmli213_wts_inc_liq, by = "ID", all = TRUE) %>%
  rename_with(~ paste0(.x, "_213"), c(FINLWT21, FINCBTXM, LIQUDYRX))

# Collapse the per-file columns into one value per household
wts_inc_liq <- local({
  waves <- c("203", "204", "211", "212", "213")
  merged <- wts_inc_liq

  # earliest non-missing value of `stem` across the files, in file order
  first_reported <- function(stem) {
    Reduce(function(value, fallback) ifelse(!is.na(value), value, fallback),
           as.list(merged[paste0(stem, "_", waves)]),
           right = TRUE)
  }

  # Average weights (over the interviews in which the household appears)
  merged$FINLWT21_AVG <- rowMeans(merged[, paste0("FINLWT21_", waves)],
                                  na.rm = TRUE)
  # First income
  merged$FINCBTXM_FST <- first_reported("FINCBTXM")
  # Liquidity (first reported value)
  merged$LIQUDYRX <- first_reported("LIQUDYRX")

  merged %>% select(ID, FINLWT21_AVG, FINCBTXM_FST, LIQUDYRX)
})

### 5 Computing average expenditures (for scaling) ####

#### 5.1 fmli_expd_creator selects the expenditure (EX_*) variables ####

# fmli_expd_creator keeps only the household ID and the expenditure columns of
# a family file that has already been processed by fmli_modifier.
# Columns are picked by name (ID plus every column starting with "EX_") rather
# than by position, so the result stays correct if the column layout produced
# upstream ever changes. Column order follows the input.
fmli_expd_creator <- function(fmli){
  expd_cols <- grep("^EX_", names(fmli), value = TRUE)
  fmli[, c("ID", expd_cols), drop = FALSE]
}

# Extract the expenditure columns of every processed family file
fmli203_expd <- fmli203 %>% fmli_expd_creator()
fmli204_expd <- fmli204 %>% fmli_expd_creator()
fmli211_expd <- fmli211 %>% fmli_expd_creator()
fmli212_expd <- fmli212 %>% fmli_expd_creator()
fmli213_expd <- fmli213 %>% fmli_expd_creator()

#### 5.2 Merge expd in all periods ####
# Full outer joins on ID across the five files. Every expenditure column is
# tagged with the file it came from: the first merge does so through its
# suffixes, the later ones by renaming the newly added columns.
expd <- merge(fmli203_expd, fmli204_expd, by = "ID", all = TRUE,
              suffixes = c("_203", "_204"))

# merge expd with fmli211
expd <- merge(expd, fmli211_expd, by = "ID", all = TRUE) %>%
  rename_with(~ paste0(.x, "_211"),
              all_of(setdiff(names(fmli211_expd), "ID")))

# merge expd with fmli212
expd <- merge(expd, fmli212_expd, by = "ID", all = TRUE) %>%
  rename_with(~ paste0(.x, "_212"),
              all_of(setdiff(names(fmli212_expd), "ID")))

# merge expd with fmli213
expd <- merge(expd, fmli213_expd, by = "ID", all = TRUE) %>%
  rename_with(~ paste0(.x, "_213"),
              all_of(setdiff(names(fmli213_expd), "ID")))

#### 5.3 Computing average expenditures #####

# For every expenditure category, average the household's values over the
# files in which it appears (missing files are ignored), then keep only ID and
# the averages
expd <- local({
  waves <- c("203", "204", "211", "212", "213")
  stems <- c("EX_FD", "EX_SN", "EX_N", "EX_T",
             "EX_FD_HM", "EX_FD_AW", "EX_ALC",
             "EX_UT_HO", "EX_PC_MIS", "EX_TR_GAS", "EX_TBC",
             "EX_APR", "EX_HLT", "EX_READ",
             "EX_HS", "EX_EDU", "EX_ENT", "EX_TRANS", "EX_CACT")

  merged <- expd
  for (stem in stems) {
    merged[[paste0(stem, "_AVG")]] <-
      rowMeans(merged[, paste0(stem, "_", waves)], na.rm = TRUE)
  }

  merged[, c("ID", paste0(stems, "_AVG"))] # Only ID and averages
})

### 6 Merge 4 and 5 results into df and re-arrange variables ####

#### 6.1 Merge weights, income, liquidity, and average expenditure into df ####
# Attach the household-level weights, income, and liquidity, and then the
# average expenditures, to the panel (inner joins on ID)
df <- df %>%
  merge(wts_inc_liq, by = "ID") %>%
  merge(expd, by = "ID")

#### 6.2 Final re-arrange ####
# Keep and order the columns of the final panel. The column names are built
# from their naming pattern; the EIP-by-month columns are not carried forward.
df <- local({
  lags <- c("t", "tm1", "tm2", "tm3")
  main <- c("EX_FD", "EX_SN", "EX_N", "EX_T")
  subs <- c("EX_FD_HM", "EX_FD_AW", "EX_ALC",
            "EX_UT_HO", "EX_PC_MIS", "EX_TR_GAS", "EX_TBC",
            "EX_APR", "EX_HLT", "EX_READ",
            "EX_HS", "EX_EDU", "EX_ENT", "EX_TRANS", "EX_CACT")

  # identifiers, weight, income, liquidity
  id_vars <- c("ID", "NEWID", "YYMM", "INTERI",
               "FINLWT21_AVG", "FINCBTXM_FST", "LIQUDYRX")

  # basic EIPs
  eip_basic <- c(paste0("EIPI_", lags),
                 paste0("EIPII_", lags),
                 "EIPIII_t", "EIPIII_t_count", paste0("EIPIII_", lags[-1]))

  # basic expenditure (first differences of the four main categories)
  main_diffs <- paste0("d_", main, "_t")

  # controls
  controls <- c("d_NUM_ADTS_t", "d_NUM_KIDS_t", "AGE_AVG_t")

  # demographics for cleaning
  cleaning <- c("d_AGE_REF_t", "d_AGE_2", "d_FAM_SIZE_t",
                "NUM_KIDS_t", "NUM_ADTS_t", "FAM_SIZE_t",
                "AGE_REF_t", "AGE2_t", "SEX_REF_t", "SEX_REF_tm1",
                "MARITAL1_t", "MARITAL1_tm1", "CUTENURE_t",
                "NUM_KIDS_tm1", "NUM_ADTS_tm1")

  # EIPs by payment method / use: for each round, for each lag, six types
  # (expand.grid varies its first argument fastest)
  grid <- expand.grid(
    kind = c("by_ck", "by_dd", "by_dc", "for_ep", "for_debt", "for_sv"),
    lag = lags,
    round = c("EIPI", "EIPII", "EIPIII"),
    stringsAsFactors = FALSE
  )
  eip_detail <- paste0(grid$round, "_", grid$kind, "_", grid$lag)

  # EIP status dummies
  dummies <- c("r", "ck", "dd", "dc", "ck_dd", "ck_dc", "dd_dc", "ck_dd_dc",
               "ep", "debt", "sv", "ep_debt", "ep_sv", "debt_sv", "ep_debt_sv")

  # expenditure levels: <category>_t immediately followed by <category>_tm1
  expd_levels <- as.vector(rbind(paste0(main, "_t"), paste0(main, "_tm1")))

  # first differences of the expenditure sub-categories
  sub_diffs <- paste0("d_", subs, "_t")

  # scalers (average expenditures)
  scalers <- paste0(c(main, subs), "_AVG")

  keep <- c(id_vars, eip_basic, main_diffs, controls, cleaning, eip_detail,
            dummies, expd_levels, sub_diffs, scalers)

  df %>% select(all_of(keep))
})

### 7 Clean df to get the two samples ####

#### 7.1 All households sample ####
# The sample is reduced one restriction at a time, so that the sample size can
# be checked after each step

# Exclude CUs living in student housing (CUTENURE_t == 6)
df_all_cu <- filter(df, CUTENURE_t != 6)

# Keep CUs whose reference person is aged 21 to 85 and whose second member,
# when AGE2_t is reported, is aged 21 to 85 as well
df_all_cu <- filter(df_all_cu, AGE_REF_t >= 21, AGE_REF_t <= 85)
df_all_cu <- filter(df_all_cu, is.na(AGE2_t) | (AGE2_t >= 21 & AGE2_t <= 85))

# Exclude CUs whose reference person's age changed by more than 1 or by less
# than 0 while the reference person's sex stayed the same (rows where this
# cannot be evaluated are excluded too)
df_all_cu <- df_all_cu %>%
  mutate(age_ref_jump = (d_AGE_REF_t > 1 | d_AGE_REF_t < 0) &
           SEX_REF_t == SEX_REF_tm1) %>%
  filter(!age_ref_jump) %>%
  select(-age_ref_jump)

# Exclude CUs whose second member's age changed by more than 1 or by less than
# 0 while the reference person's sex AND marital status both stayed the same;
# rows where this cannot be evaluated (e.g. no AGE2) are kept
df_all_cu <- df_all_cu %>%
  mutate(age2_jump = (d_AGE_2 > 1 | d_AGE_2 < 0) &
           SEX_REF_t == SEX_REF_tm1 &
           MARITAL1_t == MARITAL1_tm1) %>%
  filter(is.na(age2_jump) | !age2_jump) %>%
  select(-age2_jump)

# Exclude CUs whose family size changed by more than 3 in absolute value
df_all_cu <- filter(df_all_cu, abs(d_FAM_SIZE_t) <= 3)

# Exclude the bottom 1 percent of non-durable consumption per adult-equivalent
# (a kid counts as 0.6 adult), measured against a time trend.
# TT is the month index: 0 for 2010 up to 10 for 2108, and 11 for any other
# non-missing YYMM (i.e. 2109)
df_all_cu <- df_all_cu %>%
  mutate(
    EX_N_PC = EX_N_t / (NUM_ADTS_t + 0.6 * NUM_KIDS_t),
    TT = match(YYMM, c(2010, 2011, 2012, 2101:2108)) - 1,
    TT = ifelse(is.na(YYMM) | !is.na(TT), TT, 11)
  )

# Quantile regression of per capita consumption on the time trend at tau = 0.01
qr_bot <- rq(data = df_all_cu, EX_N_PC ~ TT, tau = 0.01)

summary(qr_bot)

# Rows lying below the fitted 1st-percentile line are excluded; the helper
# columns are removed before saving
df_all_cu <- df_all_cu %>%
  mutate(fit_val_bot = qr_bot$fitted.values) %>%
  filter(!(fit_val_bot > EX_N_PC)) %>%
  select(-c(EX_N_PC, fit_val_bot, TT))
write.csv(df_all_cu, "df_all_cu.csv", row.names = FALSE)

#### 7.2 Final Panel ####

# Exclude CUs living in student housing (CUTENURE_t == 6), CUs whose reference
# person is younger than 21, and CUs whose second member (when AGE2_t is
# reported) is younger than 21
df_f <- df %>%
  filter(CUTENURE_t != 6) %>%
  filter(AGE_REF_t >= 21) %>%
  filter(is.na(AGE2_t) | AGE2_t >= 21)

# Exclude CUs whose reference person's age changed by more than 1 or by less
# than 0 while the reference person's sex stayed the same
df_f <- df_f %>%
  mutate(drop = ifelse((d_AGE_REF_t > 1 | d_AGE_REF_t < 0) &
                         SEX_REF_t == SEX_REF_tm1, 1, 0)) %>%
  filter(drop == 0)

# Exclude CUs whose second member's age changed by more than 1 or by less than
# 0 while the reference person's sex AND marital status both stayed the same;
# rows where the flag is missing (e.g. no AGE2) are kept
df_f <- df_f %>%
  mutate(drop = ifelse((d_AGE_2 > 1 | d_AGE_2 < 0) &
                         SEX_REF_t == SEX_REF_tm1 &
                         MARITAL1_t == MARITAL1_tm1, 1, 0)) %>%
  filter(drop == 0 | is.na(drop))

# Exclude CUs whose family size changed by more than 3 in absolute value
df_f <- df_f %>% filter(abs(d_FAM_SIZE_t) <= 3)

# Non-durable consumption per adult-equivalent (a kid counts as 0.6 adult)
df_f <- df_f %>% mutate(EX_N_PC = EX_N_t / (NUM_ADTS_t + 0.6 * NUM_KIDS_t))

# Print the 1st percentile of EX_N_PC for every interview month
df_f %>%
  group_by(YYMM) %>%
  summarize(quant1 = quantile(EX_N_PC, probs = 0.01)) %>%
  ungroup()

# Exclude CUs at or below the hard-coded cutoff of their interview month
# (presumably transcribed from the percentile table printed above - TODO
# confirm). YYMM is never missing here because df only contains rows that
# matched one of the twelve month codes.
df_f <- local({
  month_codes <- c(2010, 2011, 2012, 2101:2109)
  pc_cutoffs <- c(922, 662, 787, 751, 782, 760, 798, 727, 679, 713, 541, 632)
  cutoff <- pc_cutoffs[match(df_f$YYMM, month_codes)]

  df_f %>%
    mutate(drop = ifelse(is.na(cutoff), 0,
                         ifelse(EX_N_PC <= cutoff, 1, 0))) %>%
    filter(drop == 0)
})

# Drop high income
df_f$MARITAL_t <- ifelse(df_f$MARITAL1_t == 1, 1, 0)

#### Income cutoff table ####

# For single, without kids

check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 75000 & FINCBTXM_FST > 50000)
table(check$r)

check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 100000 & FINCBTXM_FST > 75000)
table(check$r)

check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 125000 & FINCBTXM_FST > 100000)
table(check$r)

check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 150000 & FINCBTXM_FST > 125000)
table(check$r)

# For single, with kids

check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 125000 & FINCBTXM_FST > 100000)
table(check$r)

check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 150000 & FINCBTXM_FST > 125000)
table(check$r)

check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 175000 & FINCBTXM_FST > 150000)
table(check$r)

check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 200000 & FINCBTXM_FST > 175000)
table(check$r)

check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 225000 & FINCBTXM_FST > 200000)
table(check$r)

check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 250000 & FINCBTXM_FST > 225000)
table(check$r)

check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 275000 & FINCBTXM_FST > 250000)
table(check$r)

check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 300000 & FINCBTXM_FST > 275000)
table(check$r)

# For married couple, no kids
# MARITAL_t == 1 and no kids: tabulate `r` within 25,000-wide bands of
# FINCBTXM_FST from 125,000 to 250,000 (band edges excluded).
married_no_kids <- df_f %>%
  filter(MARITAL_t == 1, NUM_KIDS_t == 0)

check <- married_no_kids %>% filter(FINCBTXM_FST > 125000, FINCBTXM_FST < 150000)
table(check$r)

check <- married_no_kids %>% filter(FINCBTXM_FST > 150000, FINCBTXM_FST < 175000)
table(check$r)

check <- married_no_kids %>% filter(FINCBTXM_FST > 175000, FINCBTXM_FST < 200000)
table(check$r)

check <- married_no_kids %>% filter(FINCBTXM_FST > 200000, FINCBTXM_FST < 225000)
table(check$r)

check <- married_no_kids %>% filter(FINCBTXM_FST > 225000, FINCBTXM_FST < 250000)
table(check$r)

# 225,000 -- the income cap applied to this household type in the cutoff step

# For married couple, with kids
# MARITAL_t == 1 and at least one kid: tabulate `r` within 25,000-wide bands
# of FINCBTXM_FST from 125,000 to 275,000 (band edges excluded).
married_with_kids <- df_f %>%
  filter(MARITAL_t == 1, NUM_KIDS_t > 0)

check <- married_with_kids %>% filter(FINCBTXM_FST > 125000, FINCBTXM_FST < 150000)
table(check$r)

check <- married_with_kids %>% filter(FINCBTXM_FST > 150000, FINCBTXM_FST < 175000)
table(check$r)

check <- married_with_kids %>% filter(FINCBTXM_FST > 175000, FINCBTXM_FST < 200000)
table(check$r)

check <- married_with_kids %>% filter(FINCBTXM_FST > 200000, FINCBTXM_FST < 225000)
table(check$r)

check <- married_with_kids %>% filter(FINCBTXM_FST > 225000, FINCBTXM_FST < 250000)
table(check$r)

check <- married_with_kids %>% filter(FINCBTXM_FST > 250000, FINCBTXM_FST < 275000)
table(check$r)

# 225,000 -- the income cap applied to this household type in the cutoff step

# For adults, no kids
# Unmarried, more than one adult, no kids: tabulate `r` within 25,000-wide
# bands of FINCBTXM_FST (band edges excluded).
# NOTE(review): the 200,000-225,000 band is not tabulated for this group --
# confirm that the gap is intentional.
adults_no_kids <- df_f %>%
  filter(MARITAL_t == 0, NUM_ADTS_t > 1, NUM_KIDS_t == 0)

check <- adults_no_kids %>% filter(FINCBTXM_FST > 175000, FINCBTXM_FST < 200000)
table(check$r)

check <- adults_no_kids %>% filter(FINCBTXM_FST > 225000, FINCBTXM_FST < 250000)
table(check$r)

check <- adults_no_kids %>% filter(FINCBTXM_FST > 250000, FINCBTXM_FST < 275000)
table(check$r)

check <- adults_no_kids %>% filter(FINCBTXM_FST > 275000, FINCBTXM_FST < 300000)
table(check$r)

check <- adults_no_kids %>% filter(FINCBTXM_FST > 300000, FINCBTXM_FST < 325000)
table(check$r)

check <- adults_no_kids %>% filter(FINCBTXM_FST > 325000, FINCBTXM_FST < 350000)
table(check$r)

check <- adults_no_kids %>% filter(FINCBTXM_FST > 350000, FINCBTXM_FST < 375000)
table(check$r)

check <- adults_no_kids %>% filter(FINCBTXM_FST > 375000, FINCBTXM_FST < 400000)
table(check$r)

check <- adults_no_kids %>% filter(FINCBTXM_FST > 400000, FINCBTXM_FST < 425000)
table(check$r)

check <- adults_no_kids %>% filter(FINCBTXM_FST > 425000, FINCBTXM_FST < 450000)
table(check$r)

# 425,000 -- the income cap applied to this household type in the cutoff step

# For adults, with kids
# Unmarried, more than one adult, at least one kid: tabulate `r` within
# 25,000-wide bands of FINCBTXM_FST from 175,000 to 425,000 (edges excluded).
adults_with_kids <- df_f %>%
  filter(MARITAL_t == 0, NUM_ADTS_t > 1, NUM_KIDS_t > 0)

check <- adults_with_kids %>% filter(FINCBTXM_FST > 175000, FINCBTXM_FST < 200000)
table(check$r)

check <- adults_with_kids %>% filter(FINCBTXM_FST > 200000, FINCBTXM_FST < 225000)
table(check$r)

check <- adults_with_kids %>% filter(FINCBTXM_FST > 225000, FINCBTXM_FST < 250000)
table(check$r)

check <- adults_with_kids %>% filter(FINCBTXM_FST > 250000, FINCBTXM_FST < 275000)
table(check$r)

check <- adults_with_kids %>% filter(FINCBTXM_FST > 275000, FINCBTXM_FST < 300000)
table(check$r)

check <- adults_with_kids %>% filter(FINCBTXM_FST > 300000, FINCBTXM_FST < 325000)
table(check$r)

check <- adults_with_kids %>% filter(FINCBTXM_FST > 325000, FINCBTXM_FST < 350000)
table(check$r)

check <- adults_with_kids %>% filter(FINCBTXM_FST > 350000, FINCBTXM_FST < 375000)
table(check$r)

check <- adults_with_kids %>% filter(FINCBTXM_FST > 375000, FINCBTXM_FST < 400000)
table(check$r)

check <- adults_with_kids %>% filter(FINCBTXM_FST > 400000, FINCBTXM_FST < 425000)
table(check$r)


# cutoff
# Keep a CU only when FINCBTXM_FST is at or below the cap for its household
# type (the six types are mutually exclusive). Rows matching no rule are
# flagged drop = 1 and removed; the helper columns are dropped before saving.
df_f <- df_f %>%
  mutate(
    drop = case_when(
      # Unmarried, one adult
      MARITAL_t == 0 & NUM_ADTS_t == 1 & NUM_KIDS_t == 0 & FINCBTXM_FST <= 125000 ~ 0,
      MARITAL_t == 0 & NUM_ADTS_t == 1 & NUM_KIDS_t > 0 & FINCBTXM_FST <= 275000 ~ 0,
      # Married
      MARITAL_t == 1 & NUM_KIDS_t == 0 & FINCBTXM_FST <= 225000 ~ 0,
      MARITAL_t == 1 & NUM_KIDS_t > 0 & FINCBTXM_FST <= 225000 ~ 0,
      # Unmarried, more than one adult
      MARITAL_t == 0 & NUM_ADTS_t > 1 & NUM_KIDS_t == 0 & FINCBTXM_FST <= 425000 ~ 0,
      MARITAL_t == 0 & NUM_ADTS_t > 1 & NUM_KIDS_t > 0 & FINCBTXM_FST <= 425000 ~ 0,
      TRUE ~ 1
    )
  ) %>%
  filter(drop == 0) %>%
  select(-c(EX_N_PC, drop, MARITAL_t))

write.csv(df_f, "df_f.csv", row.names = FALSE)

### 8 Create imputed value of EIP3 ###

# Import imputed EIP data
imput_eip <- read_excel("Raw data/imputed_eip.xlsx")

#### 8.1 Restrict df_f to CUs with at least one interview in the months listed below (which include 2105 and 2106)
# All rows of a qualifying CU are kept; the result stays grouped by ID.
df_f_imp <- df_f %>%
  group_by(ID) %>%
  filter(any(YYMM %in% c(2008, 2009, 2011, 2012, 2102, 2103,
                         2105, 2106, 2108, 2109, 2111, 2112)))

#### 8.2 Merge df_f_imp with imputed values in IMPUT_EIP.xlsx
####    These imputations were created using tax unit level data on AGI and number of dependents internal to the BLS
# Inner join: only rows whose NEWID appears in imput_eip (as `newid`) survive.
df_f_imp <- merge(x = df_f_imp, y = imput_eip, by.x = "NEWID", by.y = "newid")


#### 8.3 Create imputed value for EIPIII_t

# Pick the imputed amount belonging to interview number `interi` (1-4).
# Any other non-missing interview number yields `otherwise`; a missing
# interview number yields NA.
imp_eip3_by_interview <- function(interi, amt1, amt2, amt3, amt4, otherwise) {
  ifelse(interi == 1, amt1,
         ifelse(interi == 2, amt2,
                ifelse(interi == 3, amt3,
                       ifelse(interi == 4, amt4, otherwise))))
}

# The three assignments below run in order; each one sees the value of
# imp_eip3t produced by the previous one.
df_f_imp <- df_f_imp %>%
  mutate(
    # Step 1: if the reported EIPIII_t equals one of the four imputed amounts
    # (checked in order 1 to 4), take that amount; otherwise 0.
    imp_eip3t = ifelse(
      EIPIII_t == imp_eip3_1, imp_eip3_1,
      ifelse(
        EIPIII_t == imp_eip3_2, imp_eip3_2,
        ifelse(
          EIPIII_t == imp_eip3_3, imp_eip3_3,
          ifelse(EIPIII_t == imp_eip3_4, imp_eip3_4, 0)
        )
      )
    ),
    # Step 2: 2105/2106 interviews with EIPIII_t == 0 take the imputed amount
    # for their own interview number (-100 flags an unexpected INTERI).
    imp_eip3t = ifelse(
      (YYMM == 2105 | YYMM == 2106) & EIPIII_t == 0,
      imp_eip3_by_interview(INTERI, imp_eip3_1, imp_eip3_2, imp_eip3_3, imp_eip3_4, -100),
      imp_eip3t
    ),
    # Step 3: 2105/2106 interviews with a positive EIPIII_t that matched no
    # imputed amount in step 1 (imp_eip3t still 0) also take the amount for
    # their interview number (-200 flags an unexpected INTERI).
    imp_eip3t = ifelse(
      (YYMM == 2105 | YYMM == 2106) & EIPIII_t > 0 & imp_eip3t == 0,
      imp_eip3_by_interview(INTERI, imp_eip3_1, imp_eip3_2, imp_eip3_3, imp_eip3_4, -200),
      imp_eip3t
    )
  )

#### 8.4 Create imputed value for EIPIII_tm1 (and tm2, tm3)
# Within each ID, rows are sorted by YYMM and imp_eip3t is shifted back by
# one, two and three rows; 0 is used where no earlier row exists.
# The result stays grouped by ID.
df_f_imp <- df_f_imp %>%
  group_by(ID) %>%
  arrange(YYMM, .by_group = TRUE) %>%
  mutate(
    imp_eip3tm1 = dplyr::lag(imp_eip3t, n = 1, default = 0),
    imp_eip3tm2 = dplyr::lag(imp_eip3t, n = 2, default = 0),
    imp_eip3tm3 = dplyr::lag(imp_eip3t, n = 3, default = 0)
  )

#### 8.5 Assign lagged imputed EIPIII value to CUs without observation in 2105 or 2106
# Only rows whose lag is still 0 are touched. The replacement is the imputed
# amount of an earlier interview number: one interview back for 2108/2109
# rows, two interviews back for 2111/2112 rows. Where no such earlier
# interview number exists the value is NA; -300 flags an unexpected INTERI.
df_f_imp <- df_f_imp %>%
  mutate(
    imp_eip3tm1 = ifelse(
      (YYMM == 2108 | YYMM == 2109) & imp_eip3tm1 == 0,
      ifelse(
        INTERI == 1, NA,
        ifelse(
          INTERI == 2, imp_eip3_1,
          ifelse(
            INTERI == 3, imp_eip3_2,
            ifelse(INTERI == 4, imp_eip3_3, -300)
          )
        )
      ),
      imp_eip3tm1
    ),
    imp_eip3tm2 = ifelse(
      (YYMM == 2111 | YYMM == 2112) & imp_eip3tm2 == 0,
      ifelse(
        INTERI == 1, NA,
        ifelse(
          INTERI == 2, NA,
          ifelse(
            INTERI == 3, imp_eip3_1,
            ifelse(INTERI == 4, imp_eip3_2, -300)
          )
        )
      ),
      imp_eip3tm2
    )
  )

#### 8.6 Create categorical representation of observed and imputed EIP value

# Map an EIP amount to a size category:
#   0 = zero or negative
#   1, 3, 5, 7 = strictly between consecutive multiples of 1,400 (up to 5,600)
#   2, 4, 6, 8 = exactly 1,400 / 2,800 / 4,200 / 5,600
#   9 = above 5,600
# Missing amounts stay NA.
eip3_size_category <- function(amount) {
  case_when(
    is.na(amount) ~ NA_real_,
    amount > 0 & amount < 1400 ~ 1,
    amount == 1400 ~ 2,
    amount > 1400 & amount < 2800 ~ 3,
    amount == 2800 ~ 4,
    amount > 2800 & amount < 4200 ~ 5,
    amount == 4200 ~ 6,
    amount > 4200 & amount < 5600 ~ 7,
    amount == 5600 ~ 8,
    amount > 5600 ~ 9,
    TRUE ~ 0
  )
}

df_f_imp <- df_f_imp %>%
  mutate(
    eip3t_cat = eip3_size_category(EIPIII_t),
    impeip3t_cat = eip3_size_category(imp_eip3t)
  )

# Remove the per-interview imputation inputs (EIP2 and EIP3, interviews 1-4)
# and save the final sample with imputations.
df_f_imp <- df_f_imp %>%
  select(
    -imp_eip2_1, -imp_eip2_2, -imp_eip2_3, -imp_eip2_4,
    -imp_eip3_1, -imp_eip3_2, -imp_eip3_3, -imp_eip3_4
  )
write.csv(df_f_imp, "df_f_imp.csv", row.names = FALSE)

# Keep only df, all CU sample, the final sample, and the final sample with imputations
# (keep_objects is not in its own list, so it is removed along with the rest)
keep_objects <- c("df", "df_all_cu", "df_f", "df_f_imp")
rm(list = setdiff(ls(), keep_objects))

